# Extract sample annotations and data tables from the text files in the data
# folder and write one combined CSV per input file.
#
# Each input file is read as a header-less CSV. A sample record starts at a
# row whose first column contains the literal text "^SAMPLE"; its data table
# sits between the "!sample_table_begin" and "!sample_table_end" rows. The
# output stacks four annotation rows (sampleID, platform, gender, age) on top
# of the table values, with one column per sample.
# NOTE(review): these markers look like the GEO SOFT format - confirm.

data_dir <- "/home/qja0428/Dropbox/research/Biostat/data"
output_dir <- "/home/qja0428/Dropbox/research/Biostat"

# Return the first element of `rows` that lies in [from, to], or NA if none.
first_in_block <- function(rows, from, to) {
  inside <- rows[rows >= from & rows <= to]
  if (length(inside) > 0) inside[1] else NA_integer_
}

# Return the text following the first ": " on line `row` of `lines`, or NA
# when the sample has no such line (`row` is NA).
value_after_colon <- function(lines, row) {
  if (is.na(row)) {
    return(NA_character_)
  }
  strsplit(lines[row], ": ")[[1]][2]
}

# set the working directory to the data folder so that list.files() returns
# names that read.csv() can open directly
# NOTE(review): setwd() is kept so the script leaves the session in the same
# directory as before; list.files(data_dir, full.names = TRUE) would avoid it.
setwd(data_dir)
datafiles <- list.files()

output <- data.frame()

for (i in seq_along(datafiles)) {
  # load the data
  dataset <- read.csv(datafiles[i], header = FALSE, stringsAsFactors = FALSE)
  first_col <- dataset[, 1]

  # Row numbers of the marker lines. Every pattern is matched as literal text
  # (fixed = TRUE), so the "^" in "^SAMPLE" is a plain caret, not an anchor.
  sampleID <- grep("^SAMPLE", first_col, fixed = TRUE)
  begin <- grep("!sample_table_begin", first_col, fixed = TRUE)
  end <- grep("!sample_table_end", first_col, fixed = TRUE)
  sex <- grep("= Sex: ", first_col, fixed = TRUE)
  age <- grep("= age: ", first_col, fixed = TRUE)
  # you can define whatever other information you want in the data
  platID <- grep("Sample_platform_id", first_col, fixed = TRUE)

  # Without any sample there is nothing to write, and `platform` (used for
  # the output file name) would be undefined or left over from the previous
  # file, which would overwrite that file's output.
  if (length(sampleID) == 0) {
    warning("No sample records found in ", datafiles[i], "; file skipped",
            call. = FALSE)
    next
  }

  # A sample's block runs from its "^SAMPLE" row up to the row before the
  # next "^SAMPLE" row (or to the end of the file for the last sample).
  block_end <- c(sampleID[-1] - 1L, nrow(dataset))

  name <- c("sampleID", "platform", "gender", "age")

  # One annotation vector and one value column per sample, collected in
  # preallocated lists instead of growing data frames inside the loop.
  info_cols <- vector("list", length(sampleID))
  value_cols <- vector("list", length(sampleID))
  id_col <- NULL

  for (j in seq_along(sampleID)) {
    from <- sampleID[j]
    to <- block_end[j]

    # Look every marker up inside this sample's own block, so a sample that
    # lacks a field gets NA instead of shifting the values of later samples.
    table_begin <- first_in_block(begin, from, to)
    table_end <- first_in_block(end, from, to)
    if (is.na(table_begin) || is.na(table_end)) {
      stop("Sample starting at row ", from, " of ", datafiles[i],
           " has no complete data table", call. = FALSE)
    }

    # Data rows start two rows below the begin marker (presumably skipping a
    # column-header row - TODO confirm) and stop one row above the end
    # marker. seq(length.out = ) yields no rows for an empty table, whereas
    # `a:b` would count backwards.
    n_rows <- table_end - table_begin - 2L
    table_rows <- seq(table_begin + 2L, length.out = max(0L, n_rows))
    tmp_data <- dataset[table_rows, 1:2]

    # get the sample id
    sam <- sub("^SAMPLE = ", "", first_col[from], fixed = TRUE)
    # get the platform
    platform <- sub("!Sample_platform_id = ", "",
                    first_col[first_in_block(platID, from, to)],
                    fixed = TRUE)
    # get the gender
    gender <- value_after_colon(first_col, first_in_block(sex, from, to))
    # get the age; non-numeric text becomes NA (with a warning)
    Age <- as.numeric(value_after_colon(first_col,
                                        first_in_block(age, from, to)))

    # c() coerces the four values to character
    info_cols[[j]] <- c(sam, platform, gender, Age)

    # The first sample supplies the identifier column; every sample supplies
    # its value column.
    if (j == 1) {
      id_col <- tmp_data[, 1]
    } else if (nrow(tmp_data) != length(id_col)) {
      stop("Sample ", sam, " in ", datafiles[i], " has ", nrow(tmp_data),
           " table rows but the first sample has ", length(id_col),
           call. = FALSE)
    }
    value_cols[[j]] <- tmp_data[, 2]
  }

  # Assemble both parts with the same layout: a label/identifier column
  # followed by one column per sample.
  col_ids <- paste0("V", seq_len(length(sampleID) + 1L))
  infor <- as.data.frame(setNames(c(list(name), info_cols), col_ids),
                         stringsAsFactors = FALSE)
  gene <- as.data.frame(setNames(c(list(id_col), value_cols), col_ids),
                        stringsAsFactors = FALSE)

  # Blank out the column names on both parts, as the script always has:
  # rbind() matches data-frame columns by name, and identical names on both
  # sides make it stack them by position. These placeholder names are also
  # what write.csv() emits as the header row, so they are left unchanged.
  names(infor) <- ""
  names(gene) <- ""
  output <- rbind(infor, gene)

  # The file is named after the platform of the last sample in the input.
  # NOTE(review): two inputs with the same platform write to the same file.
  file_name <- paste0(platform, ".csv")
  location <- file.path(output_dir, file_name)
  write.csv(output, location, row.names = FALSE)

  # Unfinished draft kept from the original script:
  # check how many the platform_id
  # platform <- dataset[platID, 1]
  # n_plat <- length(unique(platform))
  # if (n_plat == 1){
  #   for (j in seq_along(sampleID)) {
  #     #get the information for one sample
  #     infor <- dataset[sampleID[j]:(begin[j]-1),1]
  #     id <- infor
  #   }
  # } else{
  # }
}