# data preparation.R 2.63 KB
# Convert every raw data file in the data folder into one CSV: four metadata
# rows (sample ID, platform, gender, age) stacked on top of the expression
# table, with one column per sample.
# NOTE(review): the marker lines searched for below look like GEO SOFT
# records -- confirm against the actual input files.

# Set the working directory to the folder holding the raw data files.
# NOTE(review): setwd() changes global state for the whole session; it is kept
# so that anything run after this script still sees the same directory.
setwd("/home/qja0428/Dropbox/research/Biostat/data")

# Folder that receives the generated CSV files. It is the parent of the data
# folder, so outputs are never picked up again as inputs.
out_dir <- "/home/qja0428/Dropbox/research/Biostat"

# Labels for the four metadata rows at the top of each output file.
name <- c("sampleID", "platform", "gender", "age")

# Row numbers of one sample's data lines: from two lines below the
# "!sample_table_begin" marker up to the line above "!sample_table_end".
# The line right after the begin marker is skipped (presumably the table's
# column header -- confirm). Built with seq_len() so that an empty table
# yields no rows instead of a reversed range.
table_rows <- function(begin_row, end_row) {
  n_rows <- max(end_row - begin_row - 2, 0)
  begin_row + 1 + seq_len(n_rows)
}

# Second ": "-separated piece of each line (NA when a line has no ": ").
field_value <- function(lines) {
  vapply(strsplit(lines, ": "), function(parts) parts[2], character(1))
}

# list.files() also returns sub-directories, which read.csv() cannot open.
datafiles <- list.files()
datafiles <- datafiles[!dir.exists(datafiles)]

for (i in seq_along(datafiles)) {

  # Load the file without a header row and without converting text to factors.
  dataset <- read.csv(datafiles[i], header = FALSE, stringsAsFactors = FALSE)
  first_col <- dataset[, 1]

  # Row numbers of the marker lines. All patterns are matched literally
  # (fixed = TRUE), so the "^" in "^SAMPLE" is a real character, not an anchor.
  sampleID <- grep("^SAMPLE", first_col, fixed = TRUE)
  begin <- grep("!sample_table_begin", first_col, fixed = TRUE)
  end <- grep("!sample_table_end", first_col, fixed = TRUE)
  sex <- grep("= Sex: ", first_col, fixed = TRUE)
  age <- grep("= age: ", first_col, fixed = TRUE)
  platID <- grep("Sample_platform_id", first_col, fixed = TRUE)
  # You can locate whatever other information you want in the same way.

  # Without any sample record there is nothing to export. Skipping also
  # prevents reusing the platform name left over from the previous file.
  n_samples <- length(sampleID)
  if (n_samples == 0) {
    warning("No '^SAMPLE' records found in ", datafiles[i],
            "; file skipped.", call. = FALSE)
    next
  }

  # The j-th entry of every marker vector is assumed to belong to the j-th
  # sample. If a sample lacks one of the lines, later samples silently get
  # their neighbour's values, so make the mismatch visible.
  marker_counts <- c(begin = length(begin), end = length(end),
                     sex = length(sex), age = length(age),
                     platID = length(platID))
  if (any(marker_counts != n_samples)) {
    warning("Marker counts differ from the number of samples (", n_samples,
            ") in ", datafiles[i], ": ",
            paste(names(marker_counts), marker_counts,
                  sep = " = ", collapse = ", "),
            "; metadata may be misaligned.", call. = FALSE)
  }

  idx <- seq_len(n_samples)

  # Metadata for all samples at once; a missing marker yields NA.
  sam <- sub("^SAMPLE = ", "", first_col[sampleID], fixed = TRUE)
  platforms <- sub("!Sample_platform_id = ",
                   "", first_col[platID[idx]], fixed = TRUE)
  gender <- field_value(first_col[sex[idx]])
  # Non-numeric ages become NA (with a coercion warning), as before.
  Age <- as.numeric(field_value(first_col[age[idx]]))

  if (length(unique(platforms)) > 1) {
    warning("More than one platform id in ", datafiles[i],
            "; the output file is named after the last sample's platform.",
            call. = FALSE)
  }

  # Metadata block: first column holds the row labels, then one column per
  # sample. rbind() coerces the numeric ages to text alongside the other rows;
  # deparse.level = 0 keeps the variable names from becoming row names.
  meta <- rbind(sam, platforms, gender, Age, deparse.level = 0)
  infor <- data.frame(name, meta, stringsAsFactors = FALSE)

  # Expression block: the first column is taken from the first sample's table
  # (its first data column), followed by the second data column of every
  # sample. All samples are assumed to list their rows in the same order.
  probe_ids <- dataset[table_rows(begin[1], end[1]), 1]
  value_cols <- lapply(idx, function(j) {
    dataset[table_rows(begin[j], end[j]), 2]
  })
  names(value_cols) <- paste0("sample", idx)
  gene <- data.frame(probe_ids, value_cols, stringsAsFactors = FALSE)

  # Give both frames the same (blank) column names so that rbind() stacks them
  # by position; this also keeps the header line of the CSV as it was.
  names(infor) <- ""
  names(gene) <- ""
  output <- rbind(infor, gene)

  # The file is named after the platform of the last sample in the file.
  file_name <- paste0(platforms[n_samples], ".csv")
  location <- file.path(out_dir, file_name)

  write.csv(output, location, row.names = FALSE)
}