Jingan Qu / r_code

Browse Code »

Commit 658ed0df54291b9bdcc2ac2ed94a29e598a022a1

Authored by Jingan Qu 2016-11-07 09:48:28 -0500

1 parent b688af3fe9

Exists in master

upload

Showing 2 changed files with 820 additions and 13 deletions Show diff stats

GPL22111.csv

Diff comments View file @ 658ed0d

data preparation.R

Diff comments View file @ 658ed0d

 #Per-file extraction of sample metadata and values: every file in the data
 #directory is read as a header-less CSV, marker rows are located in column 1,
 #and one "<platform>.csv" file is written per input file.
 #set the working directory to the folder that holds the input data files
-setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning/data")
+setwd("/home/qja0428/Dropbox/research/Biostat/data")
 #every entry in the working directory is treated as an input file
 datafiles <- list.files()
 output <- data.frame()
 for (i in seq_along(datafiles)){
   #load the data; header = F so the first line is read as data, not column names
   dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F)
   #all lookups below search column 1 with fixed = TRUE, so each pattern is
   #matched as a literal string (the "^" in "^SAMPLE" is a literal caret)
   #row numbers of the lines that contain "^SAMPLE"
   sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE)
   #row numbers of the lines that contain "!sample_table_begin"
   begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE)
   #row numbers of the lines that contain "!sample_table_end"
   end <- grep("!sample_table_end", dataset[,1],fixed = TRUE)
   #row numbers of the lines that contain "= Sex: "
   sex <- grep("= Sex: ", dataset[,1],fixed = TRUE)
   #row numbers of the lines that contain "= age: "
   age <- grep("= age: ", dataset[,1],fixed = TRUE)
   #placeholder: add further grep() lookups here for any other
   #information you want to pull out of the data
   #row numbers of the lines that contain "Sample_platform_id"
   platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE)
+  name <- c("sampleID","platform","gender","age")
+  #metadata table: first column holds the field labels, one column per sample is appended in the loop
+  infor <- data.frame(name)
+  #value table: filled from the per-sample tables in the loop
+  gene <- data.frame()
   #NOTE(review): begin, end, sex, age and platID are all indexed by j, which
   #assumes each has exactly one match per sample, in sample order -- confirm
   for (j in seq_along(sampleID)) {
     #take columns 1:2 of the rows between the table markers; the row directly
     #after "!sample_table_begin" is skipped (presumably the table's header
     #row -- confirm) and the "!sample_table_end" row is excluded
     tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2]
     #NOTE(review): the added line below is a self-assignment (no-op), so
     #column 2 is no longer converted with as.numeric()
-    tmp_data[,2] <- as.numeric(tmp_data[,2])
+    tmp_data[,2] <- tmp_data[,2]
     #get the exact information
     #get the sample id: strip the literal "^SAMPLE = " prefix
     sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE)
     #get the platform: strip the literal "!Sample_platform_id = " prefix
     platform <- sub("!Sample_platform_id = ",
                     "", dataset[platID[j],1],fixed = TRUE)
     #get the gender: the text after the first ": " on the sex line
     gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2]
     #get the age: the text after the first ": " on the age line, as a number
     Age <- strsplit(dataset[age[j],1], ": ")[[1]][2]
     Age <- as.numeric(Age)
+    #append this sample's metadata as a new column; c() mixes strings with the numeric Age, so Age is stored as character
+    infor <- cbind(infor, c(sam, platform, gender, Age))
     #combine this sample's data into the accumulated result: the first sample
     #seeds gene with both columns, later samples append only column 2
     #NOTE(review): this relies on every sample table having the same rows in
     #the same order -- confirm
-    n <- nrow(tmp_data)
+    if (j == 1){
-    tmp_data <- cbind(tmp_data, rep(sam,n), rep(platform,n),
+      gene <- tmp_data
-                      rep(gender, n), rep(Age, n))
+    } else {
+      gene <- cbind(gene, tmp_data[,2])
+    }
-    #combine the temple data into output
-    output <- rbind(output, tmp_data)
   }
+  #output the result we get: metadata rows stacked on top of the value rows
+  #NOTE(review): only a single "" is supplied as names for each table -- confirm
+  #that rbind() then lines the columns up as intended
+  names(infor) <- ""
+  names(gene) <- ""
+  output <- rbind(infor, gene)
+  #NOTE(review): platform here is whatever the last inner-loop iteration left
+  #behind, so the file is named after the last sample's platform -- confirm
+  file_name <- paste0(platform,".csv")
+  location <- file.path("/home/qja0428/Dropbox/research/Biostat",file_name)
+  write.csv(output,location, row.names = FALSE)
   #unfinished draft (commented out): check how many distinct platform ids the file has
 #  platform <- dataset[platID, 1]
   #  n_plat <- length(unique(platform))
   #  if (n_plat == 1){
   #   for (j in seq_along(sampleID)) {
       #get the information for one sample
   #    infor <- dataset[sampleID[j]:(begin[j]-1),1]
   #   id <- infor
   # }
   #  } else{
   #  }
 }
-#name the columns
-names(output) <- c("ID_REF","Value","Sample","Platform","Gender","Age")
-#save the output into local
-setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning")
-write.csv(output, "output.csv", row.names = F)