Commit 658ed0df54291b9bdcc2ac2ed94a29e598a022a1

Authored by Jingan Qu
1 parent b688af3fe9
Exists in master

upload

Showing 2 changed files with 820 additions and 13 deletions   Show diff stats
1 1
2 2
3 #set the working directory 3 #set the working directory
4 #it must be the directory that holds the data files 4 #it must be the directory that holds the data files
5 setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning/data") 5 setwd("/home/qja0428/Dropbox/research/Biostat/data")
6 6
7 7
8 datafiles <- list.files() 8 datafiles <- list.files()
9 output <- data.frame() 9 output <- data.frame()
10 10
11 for (i in seq_along(datafiles)){ 11 for (i in seq_along(datafiles)){
12 12
13 #load the data 13 #load the data
14 dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F) 14 dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F)
15 15
16 #find the row numbers that contain "^SAMPLE" 16 #find the row numbers that contain "^SAMPLE"
17 sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE) 17 sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE)
18 18
19 #find the row numbers that contain "!sample_table_begin" 19 #find the row numbers that contain "!sample_table_begin"
20 begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE) 20 begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE)
21 21
22 #find the row numbers that contain "!sample_table_end" 22 #find the row numbers that contain "!sample_table_end"
23 end <- grep("!sample_table_end", dataset[,1],fixed = TRUE) 23 end <- grep("!sample_table_end", dataset[,1],fixed = TRUE)
24 24
25 #find the row numbers that contain "sex" 25 #find the row numbers that contain "sex"
26 sex <- grep("= Sex: ", dataset[,1],fixed = TRUE) 26 sex <- grep("= Sex: ", dataset[,1],fixed = TRUE)
27 27
28 #find the row numbers that contain "age" 28 #find the row numbers that contain "age"
29 age <- grep("= age: ", dataset[,1],fixed = TRUE) 29 age <- grep("= age: ", dataset[,1],fixed = TRUE)
30 30
31 #find the row number that contain "" 31 #find the row number that contain ""
32 #you can define whatever other information you want in the data 32 #you can define whatever other information you want in the data
33 33
34 34
35 #find the row numbers that contain the platform_id 35 #find the row numbers that contain the platform_id
36 platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE) 36 platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE)
37 37
38 name <- c("sampleID","platform","gender","age")
38 39
40 #store all the information
41 infor <- data.frame(name)
42
43 #store all the data we need
44 gene <- data.frame()
39 45
40 for (j in seq_along(sampleID)) { 46 for (j in seq_along(sampleID)) {
41 #get the data 47 #get the data
42 tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2] 48 tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2]
43 tmp_data[,2] <- as.numeric(tmp_data[,2]) 49 tmp_data[,2] <- tmp_data[,2]
44 50
45 #get the exact information 51 #get the exact information
46 #get the sample# 52 #get the sample#
47 sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE) 53 sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE)
48 #get the platform 54 #get the platform
49 platform <- sub("!Sample_platform_id = ", 55 platform <- sub("!Sample_platform_id = ",
50 "", dataset[platID[j],1],fixed = TRUE) 56 "", dataset[platID[j],1],fixed = TRUE)
51 #get the gender 57 #get the gender
52 gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2] 58 gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2]
53 #get the age 59 #get the age
54 Age <- strsplit(dataset[age[j],1], ": ")[[1]][2] 60 Age <- strsplit(dataset[age[j],1], ": ")[[1]][2]
55 Age <- as.numeric(Age) 61 Age <- as.numeric(Age)
56 62
63 #store the information into the data frame
64 infor <- cbind(infor, c(sam, platform, gender, Age))
65
57 #combine the data into temporary data 66 #combine the data into temporary data
58 n <- nrow(tmp_data) 67 if (j == 1){
59 tmp_data <- cbind(tmp_data, rep(sam,n), rep(platform,n), 68 gene <- tmp_data
60 rep(gender, n), rep(Age, n)) 69 } else {
70 gene <- cbind(gene, tmp_data[,2])
71 }
72
73
74
61 75
62 #combine the temporary data into output
63 output <- rbind(output, tmp_data)
64 76
65 } 77 }
66 78
79 #output the result we get
80 names(infor) <- ""
81 names(gene) <- ""
82 output <- rbind(infor, gene)
83
84 file_name <- paste0(platform,".csv")
85 location <- file.path("/home/qja0428/Dropbox/research/Biostat",file_name)
86
87 write.csv(output,location, row.names = FALSE)
88
67 89
68 #check how many distinct platform_ids there are 90 #check how many distinct platform_ids there are
69 # platform <- dataset[platID, 1] 91 # platform <- dataset[platID, 1]
70 # n_plat <- length(unique(platform)) 92 # n_plat <- length(unique(platform))
71 93
72 94
73 # if (n_plat == 1){ 95 # if (n_plat == 1){
74 # for (j in seq_along(sampleID)) { 96 # for (j in seq_along(sampleID)) {
75 97
76 #get the information for one sample 98 #get the information for one sample
77 # infor <- dataset[sampleID[j]:(begin[j]-1),1] 99 # infor <- dataset[sampleID[j]:(begin[j]-1),1]
78 # id <- infor 100 # id <- infor
79 101
80 # } 102 # }
81 # } else{ 103 # } else{
82 104
83 # } 105 # }
84 106
85 } 107 }
86 108
87 #name the columns
88 names(output) <- c("ID_REF","Value","Sample","Platform","Gender","Age")
89
90 #save the output into local
91 setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning")
92 write.csv(output, "output.csv", row.names = F)
93 109
94 110
95 111
96 112
97 113
98 114
99 115
100 116
101 117