Commit 658ed0df54291b9bdcc2ac2ed94a29e598a022a1
1 parent
b688af3fe9
Exists in
master
upload
Showing
2 changed files
with
820 additions
and
13 deletions
Show diff stats
GPL22111.csv
data preparation.R
1 | 1 | ||
2 | 2 | ||
3 | #set the working directory | 3 | #set the working directory |
4 | #point it at the folder that holds the data files | 4 | #point it at the folder that holds the data files |
5 | setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning/data") | 5 | setwd("/home/qja0428/Dropbox/research/Biostat/data") |
6 | 6 | ||
7 | 7 | ||
8 | datafiles <- list.files() | 8 | datafiles <- list.files() |
9 | output <- data.frame() | 9 | output <- data.frame() |
10 | 10 | ||
11 | for (i in seq_along(datafiles)){ | 11 | for (i in seq_along(datafiles)){ |
12 | 12 | ||
13 | #load the data | 13 | #load the data |
14 | dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F) | 14 | dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F) |
15 | 15 | ||
16 | #find the row number that contain "^SAMPLE" | 16 | #find the row number that contain "^SAMPLE" |
17 | sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE) | 17 | sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE) |
18 | 18 | ||
19 | #find the row number that contain "!sample_table_begin" | 19 | #find the row number that contain "!sample_table_begin" |
20 | begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE) | 20 | begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE) |
21 | 21 | ||
22 | #find the row number that contain "!sample_table_end" | 22 | #find the row number that contain "!sample_table_end" |
23 | end <- grep("!sample_table_end", dataset[,1],fixed = TRUE) | 23 | end <- grep("!sample_table_end", dataset[,1],fixed = TRUE) |
24 | 24 | ||
25 | #find the row number that contain "sex" | 25 | #find the row number that contain "sex" |
26 | sex <- grep("= Sex: ", dataset[,1],fixed = TRUE) | 26 | sex <- grep("= Sex: ", dataset[,1],fixed = TRUE) |
27 | 27 | ||
28 | #find the row number that contain "age" | 28 | #find the row number that contain "age" |
29 | age <- grep("= age: ", dataset[,1],fixed = TRUE) | 29 | age <- grep("= age: ", dataset[,1],fixed = TRUE) |
30 | 30 | ||
31 | #find the row number that contain "" | 31 | #find the row number that contain "" |
32 | #you can define whatever other information you want in the data | 32 | #you can define whatever other information you want in the data |
33 | 33 | ||
34 | 34 | ||
35 | #find the row numbers that contain the platform_id | 35 | #find the row numbers that contain the platform_id |
36 | platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE) | 36 | platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE) |
37 | 37 | ||
38 | name <- c("sampleID","platform","gender","age") | ||
38 | 39 | ||
40 | #store all the information | ||
41 | infor <- data.frame(name) | ||
42 | |||
43 | #store all the data we need | ||
44 | gene <- data.frame() | ||
39 | 45 | ||
40 | for (j in seq_along(sampleID)) { | 46 | for (j in seq_along(sampleID)) { |
41 | #get the data | 47 | #get the data |
42 | tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2] | 48 | tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2] |
43 | tmp_data[,2] <- as.numeric(tmp_data[,2]) | 49 | tmp_data[,2] <- tmp_data[,2] |
44 | 50 | ||
45 | #get the exact information | 51 | #get the exact information |
46 | #get the sample# | 52 | #get the sample# |
47 | sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE) | 53 | sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE) |
48 | #get the platform | 54 | #get the platform |
49 | platform <- sub("!Sample_platform_id = ", | 55 | platform <- sub("!Sample_platform_id = ", |
50 | "", dataset[platID[j],1],fixed = TRUE) | 56 | "", dataset[platID[j],1],fixed = TRUE) |
51 | #get the gender | 57 | #get the gender |
52 | gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2] | 58 | gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2] |
53 | #get the age | 59 | #get the age |
54 | Age <- strsplit(dataset[age[j],1], ": ")[[1]][2] | 60 | Age <- strsplit(dataset[age[j],1], ": ")[[1]][2] |
55 | Age <- as.numeric(Age) | 61 | Age <- as.numeric(Age) |
56 | 62 | ||
63 | #store the information into the data frame | ||
64 | infor <- cbind(infor, c(sam, platform, gender, Age)) | ||
65 | |||
57 | #combine the data into temporary data | 66 | #combine the data into temporary data |
58 | n <- nrow(tmp_data) | 67 | if (j == 1){ |
59 | tmp_data <- cbind(tmp_data, rep(sam,n), rep(platform,n), | 68 | gene <- tmp_data |
60 | rep(gender, n), rep(Age, n)) | 69 | } else { |
70 | gene <- cbind(gene, tmp_data[,2]) | ||
71 | } | ||
72 | |||
73 | |||
74 | |||
61 | 75 | ||
62 | #combine the temporary data into output | ||
63 | output <- rbind(output, tmp_data) | ||
64 | 76 | ||
65 | } | 77 | } |
66 | 78 | ||
79 | #output the result we get | ||
80 | names(infor) <- "" | ||
81 | names(gene) <- "" | ||
82 | output <- rbind(infor, gene) | ||
83 | |||
84 | file_name <- paste0(platform,".csv") | ||
85 | location <- file.path("/home/qja0428/Dropbox/research/Biostat",file_name) | ||
86 | |||
87 | write.csv(output,location, row.names = FALSE) | ||
88 | |||
67 | 89 | ||
68 | #check how many distinct platform_ids there are | 90 | #check how many distinct platform_ids there are |
69 | # platform <- dataset[platID, 1] | 91 | # platform <- dataset[platID, 1] |
70 | # n_plat <- length(unique(platform)) | 92 | # n_plat <- length(unique(platform)) |
71 | 93 | ||
72 | 94 | ||
73 | # if (n_plat == 1){ | 95 | # if (n_plat == 1){ |
74 | # for (j in seq_along(sampleID)) { | 96 | # for (j in seq_along(sampleID)) { |
75 | 97 | ||
76 | #get the information for one sample | 98 | #get the information for one sample |
77 | # infor <- dataset[sampleID[j]:(begin[j]-1),1] | 99 | # infor <- dataset[sampleID[j]:(begin[j]-1),1] |
78 | # id <- infor | 100 | # id <- infor |
79 | 101 | ||
80 | # } | 102 | # } |
81 | # } else{ | 103 | # } else{ |
82 | 104 | ||
83 | # } | 105 | # } |
84 | 106 | ||
85 | } | 107 | } |
86 | 108 | ||
87 | #name the columns | ||
88 | names(output) <- c("ID_REF","Value","Sample","Platform","Gender","Age") | ||
89 | |||
90 | #save the output into local | ||
91 | setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning") | ||
92 | write.csv(output, "output.csv", row.names = F) | ||
93 | 109 | ||
94 | 110 | ||
95 | 111 | ||
96 | 112 | ||
97 | 113 | ||
98 | 114 | ||
99 | 115 | ||
100 | 116 | ||
101 | 117 |