# data preparation.R
# 2.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Data preparation: flatten per-sample tables from raw text files into one
# long CSV.
#
# Every file in `data_dir` is read as a headerless CSV. Within a file, each
# sample is located through literal marker strings in the first column:
#   "^SAMPLE"               sample identifier line ("^SAMPLE = <id>")
#   "!sample_table_begin"   line before the table's header row
#   "!sample_table_end"     line after the table's last data row
#   "= Sex: " / "= age: "   per-sample characteristics
#   "Sample_platform_id"    platform identifier line
# NOTE(review): these markers look like GEO SOFT family files -- confirm.
#
# The result has one row per table row with the columns
# ID_REF, Value, Sample, Platform, Gender, Age and is written to
# "output.csv" in `project_dir`. Paths are built with file.path(), so the
# working directory of the calling session is left untouched.

project_dir <- "/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning"
data_dir <- file.path(project_dir, "data")

# Return the text that follows the first ": " on a metadata line.
# Yields NA when the line contains no ": " separator.
value_after_colon <- function(line) {
  strsplit(line, ": ", fixed = TRUE)[[1]][2]
}

# Typed zero-row frame: fixes the column layout and keeps the result well
# formed even when no file (or no sample) contributes any rows.
empty_samples <- function() {
  data.frame(
    ID_REF = character(0),
    Value = numeric(0),
    Sample = character(0),
    Platform = character(0),
    Gender = character(0),
    Age = numeric(0),
    stringsAsFactors = FALSE
  )
}

# Extract every sample table of one parsed file as a long data frame.
#
# dataset:     data frame as returned by read.csv(header = FALSE); column 1
#              holds the marker/ID text, column 2 the measured value.
# source_name: label used in error messages (typically the file path).
extract_samples <- function(dataset, source_name) {
  first_col <- dataset[, 1]
  # All markers are literal strings ("^" is part of the text, not an anchor).
  find_rows <- function(marker) grep(marker, first_col, fixed = TRUE)

  sample_rows <- find_rows("^SAMPLE")
  marker_rows <- list(
    "!sample_table_begin" = find_rows("!sample_table_begin"),
    "!sample_table_end" = find_rows("!sample_table_end"),
    "= Sex: " = find_rows("= Sex: "),
    "= age: " = find_rows("= age: "),
    "Sample_platform_id" = find_rows("Sample_platform_id")
  )

  # Markers are paired with samples purely by position. A missing or
  # duplicated marker would silently attach metadata to the wrong sample,
  # so refuse to continue instead of producing misaligned output.
  marker_counts <- lengths(marker_rows)
  mismatched <- marker_counts[marker_counts != length(sample_rows)]
  if (length(mismatched) > 0) {
    stop(
      "Marker counts do not match the ", length(sample_rows),
      " sample(s) found in ", source_name, ": ",
      paste0("'", names(mismatched), "' x ", mismatched, collapse = ", "),
      call. = FALSE
    )
  }

  begin_rows <- marker_rows[["!sample_table_begin"]]
  end_rows <- marker_rows[["!sample_table_end"]]
  sex_rows <- marker_rows[["= Sex: "]]
  age_rows <- marker_rows[["= age: "]]
  platform_rows <- marker_rows[["Sample_platform_id"]]

  per_sample <- lapply(seq_along(sample_rows), function(j) {
    # Data rows sit between the header row (begin + 1) and the end marker.
    # Using length.out avoids the backwards sequence that `a:b` produces
    # when a table has no data rows at all.
    n_rows <- max(0L, end_rows[j] - begin_rows[j] - 2L)
    table_rows <- seq.int(begin_rows[j] + 2L, length.out = n_rows)

    sample_id <- sub("^SAMPLE = ", "", first_col[sample_rows[j]],
                     fixed = TRUE)
    platform <- sub("!Sample_platform_id = ", "",
                    first_col[platform_rows[j]], fixed = TRUE)
    gender <- value_after_colon(first_col[sex_rows[j]])
    age <- as.numeric(value_after_colon(first_col[age_rows[j]]))

    data.frame(
      ID_REF = dataset[table_rows, 1],
      Value = as.numeric(dataset[table_rows, 2]),
      Sample = rep(sample_id, n_rows),
      Platform = rep(platform, n_rows),
      Gender = rep(gender, n_rows),
      Age = rep(age, n_rows),
      stringsAsFactors = FALSE
    )
  })

  # Bind once per file instead of growing a data frame inside the loop.
  do.call(rbind, c(list(empty_samples()), per_sample))
}

if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir, call. = FALSE)
}

datafiles <- list.files(data_dir, full.names = TRUE)

per_file <- lapply(datafiles, function(path) {
  dataset <- read.csv(path, header = FALSE, stringsAsFactors = FALSE)
  extract_samples(dataset, path)
})

# Single bind across files; the empty template guarantees the six named
# columns even when `datafiles` is empty.
output <- do.call(rbind, c(list(empty_samples()), per_file))

# Save the combined table next to the data directory.
write.csv(output, file.path(project_dir, "output.csv"), row.names = FALSE)