# Build one long-format table of probe values with per-sample annotations
# from every file in the data directory, then save it as output.csv.
#
# Each input file is read as a header-less CSV. Column 1 is scanned for the
# marker lines "^SAMPLE", "!sample_table_begin", "!sample_table_end",
# "= Sex: ", "= age: " and "Sample_platform_id"; columns 1-2 of the rows
# between the table markers supply ID_REF and Value.
# NOTE(review): these markers look like the NCBI GEO SOFT layout - confirm.
#
# Paths are built explicitly instead of calling setwd(), so running the
# script no longer changes the session's working directory.

data_dir <- "/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning/data"
output_dir <- "/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning"

# Zero-row table with the final column layout; used as the result when no
# sample contributes any rows.
empty_output <- function() {
  data.frame(
    ID_REF = character(0),
    Value = numeric(0),
    Sample = character(0),
    Platform = character(0),
    Gender = character(0),
    Age = numeric(0),
    stringsAsFactors = FALSE
  )
}

# Return the first element of `candidates` lying in [lo, hi], or NA if none.
first_in_range <- function(candidates, lo, hi) {
  hits <- candidates[candidates >= lo & candidates <= hi]
  if (length(hits) > 0L) hits[1L] else NA_integer_
}

# Return the text following the first ": " on lines[row]; NA when row is NA.
value_after_colon <- function(lines, row) {
  if (is.na(row)) {
    return(NA_character_)
  }
  strsplit(lines[row], ": ")[[1L]][2L]
}

# Extract every sample found in `dataset` (a data frame as returned by
# read.csv(header = FALSE)) into one data frame with the columns ID_REF,
# Value, Sample, Platform, Gender and Age. `label` only names the source in
# warnings. Samples without a complete data table are skipped with a warning.
extract_samples <- function(dataset, label = "dataset") {
  lines <- as.character(dataset[, 1L])
  rows_with <- function(marker) grep(marker, lines, fixed = TRUE)

  # fixed = TRUE: "^SAMPLE" is matched literally, caret included, and all
  # markers are case-sensitive (so "= sex: " would not be picked up).
  sample_rows <- rows_with("^SAMPLE")
  begin_rows <- rows_with("!sample_table_begin")
  end_rows <- rows_with("!sample_table_end")
  sex_rows <- rows_with("= Sex: ")
  age_rows <- rows_with("= age: ")
  platform_rows <- rows_with("Sample_platform_id")

  # A sample's section runs from its "^SAMPLE" row to the row before the
  # next one (or to the end of the file). Metadata is looked up inside that
  # section rather than by position in the marker vectors, so a sample that
  # lacks e.g. a Sex line gets NA instead of shifting every later sample.
  section_last <- c(sample_rows[-1L] - 1L, length(lines))

  pieces <- lapply(seq_along(sample_rows), function(j) {
    lo <- sample_rows[j]
    hi <- section_last[j]

    begin <- first_in_range(begin_rows, lo, hi)
    if (is.na(begin)) {
      warning(
        "Skipping sample at row ", lo, " of ", label,
        ": no '!sample_table_begin' marker",
        call. = FALSE
      )
      return(NULL)
    }
    end <- first_in_range(end_rows, begin, hi)
    if (is.na(end)) {
      warning(
        "Skipping sample at row ", lo, " of ", label,
        ": no '!sample_table_end' marker",
        call. = FALSE
      )
      return(NULL)
    }

    # The row right after the begin marker is the table's own header, so
    # data starts two rows below it and stops just above the end marker.
    # Guard the empty table: first:last would otherwise count downwards.
    first <- begin + 2L
    last <- end - 1L
    data_rows <- if (last >= first) first:last else integer(0)
    n <- length(data_rows)

    sample_name <- sub("^SAMPLE = ", "", lines[lo], fixed = TRUE)
    platform_row <- first_in_range(platform_rows, lo, hi)
    platform <- if (is.na(platform_row)) {
      NA_character_
    } else {
      sub("!Sample_platform_id = ", "", lines[platform_row], fixed = TRUE)
    }
    gender <- value_after_colon(lines, first_in_range(sex_rows, lo, hi))
    age <- as.numeric(
      value_after_colon(lines, first_in_range(age_rows, lo, hi))
    )

    data.frame(
      ID_REF = dataset[data_rows, 1L],
      # Non-numeric entries become NA (with R's usual coercion warning).
      Value = as.numeric(dataset[data_rows, 2L]),
      Sample = rep(sample_name, n),
      Platform = rep(platform, n),
      Gender = rep(gender, n),
      Age = rep(age, n),
      stringsAsFactors = FALSE
    )
  })

  # Bind once instead of growing a data frame inside the loop. rbind() drops
  # zero-row arguments, so the template only survives when nothing else does.
  pieces <- Filter(Negate(is.null), pieces)
  do.call(rbind, c(list(empty_output()), pieces))
}

# Read one input file and extract its samples.
read_sample_file <- function(path) {
  dataset <- read.csv(path, header = FALSE, stringsAsFactors = FALSE)
  if (ncol(dataset) < 2L) {
    stop(
      "Expected at least two columns in ", path, " but found ",
      ncol(dataset),
      call. = FALSE
    )
  }
  extract_samples(dataset, label = basename(path))
}

if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir, call. = FALSE)
}

# list.files() also returns sub-directories; those cannot be read as CSV.
datafiles <- list.files(data_dir, full.names = TRUE)
datafiles <- datafiles[!dir.exists(datafiles)]

output <- do.call(
  rbind,
  c(list(empty_output()), lapply(datafiles, read_sample_file))
)

# Save the combined table next to the data directory.
write.csv(output, file.path(output_dir, "output.csv"), row.names = FALSE)