data preparation.R
2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Build one combined table per input file and write it as "<platform>.csv".
#
# Every file in `data_dir` is read as a header-less CSV. The first column
# carries the marker lines ("^SAMPLE", "!Sample_platform_id", "= Sex: ",
# "= age: ", "!sample_table_begin", "!sample_table_end"); the first two
# columns carry the rows of each sample's data table.
#
# The written table has four annotation rows (sampleID, platform, gender,
# age) followed by the data-table rows, with one column per sample.

# Directory scanned for input files and directory that receives the output.
# Full paths are used throughout, so the working directory is left alone.
data_dir <- "/home/qja0428/Dropbox/research/Biostat/data"
output_dir <- "/home/qja0428/Dropbox/research/Biostat"

# Return the first index in `rows` whose text in `lines` contains the literal
# string `marker`, or NA when none of those rows contains it.
first_marker_row <- function(marker, lines, rows) {
  hits <- rows[grepl(marker, lines[rows], fixed = TRUE)]
  if (length(hits) > 0) hits[1] else NA_integer_
}

# Extract the annotation and data table of every sample in `dataset`.
#
# dataset: data frame as returned by read.csv(header = FALSE); column 1 holds
#          the marker lines / row ids, column 2 the values.
# source:  label used in error messages (typically the file path).
#
# Returns NULL when `dataset` contains no "^SAMPLE" line. Otherwise a list:
#   platform - platform id of the last sample (NA if that sample has none)
#   table    - data frame: column 1 holds the row labels, then one column per
#              sample; rows 1-4 are sampleID/platform/gender/age, the rest
#              are the data-table rows.
#
# Each marker is searched only between a sample's own "^SAMPLE" line and the
# next one, so a sample that lacks e.g. an age line gets NA instead of
# shifting the values of all following samples.
extract_samples <- function(dataset, source = "input") {
  lines <- dataset[, 1]
  sample_rows <- grep("^SAMPLE", lines, fixed = TRUE)
  n_samples <- length(sample_rows)
  if (n_samples == 0) {
    return(NULL)
  }

  # Last row of each sample's section: the row before the next "^SAMPLE".
  section_last <- c(sample_rows[-1] - 1L, length(lines))

  field_names <- c("sampleID", "platform", "gender", "age")
  sample_cols <- paste0("sample", seq_len(n_samples))
  info_cols <- setNames(vector("list", n_samples), sample_cols)
  value_cols <- setNames(vector("list", n_samples), sample_cols)
  row_ids <- NULL
  platform <- NA_character_

  for (j in seq_len(n_samples)) {
    section <- sample_rows[j]:section_last[j]
    sam <- sub("^SAMPLE = ", "", lines[sample_rows[j]], fixed = TRUE)

    table_begin <- first_marker_row("!sample_table_begin", lines, section)
    table_end <- first_marker_row("!sample_table_end", lines, section)
    if (is.na(table_begin) || is.na(table_end)) {
      stop(source, ": sample ", sam, " has no complete data table",
           call. = FALSE)
    }

    # Skip the begin marker and the table's header row; stop before the end
    # marker. Guard against a header-only table, where first:last would
    # count downwards and select the wrong rows.
    first_row <- table_begin + 2L
    last_row <- table_end - 1L
    table_rows <- if (last_row >= first_row) first_row:last_row else integer(0)
    tmp_data <- dataset[table_rows, 1:2]

    # Indexing with NA yields NA, so a missing marker simply propagates NA.
    platform_row <- first_marker_row("Sample_platform_id", lines, section)
    platform <- sub("!Sample_platform_id = ", "", lines[platform_row],
                    fixed = TRUE)
    sex_row <- first_marker_row("= Sex: ", lines, section)
    gender <- strsplit(lines[sex_row], ": ")[[1]][2]
    age_row <- first_marker_row("= age: ", lines, section)
    age <- as.numeric(strsplit(lines[age_row], ": ")[[1]][2])

    if (j == 1) {
      # Row labels are taken from the first sample; later samples are
      # combined by position, so they must have the same number of rows.
      row_ids <- tmp_data[, 1]
    } else if (nrow(tmp_data) != length(row_ids)) {
      stop(source, ": sample ", sam, " has ", nrow(tmp_data),
           " table rows, expected ", length(row_ids), call. = FALSE)
    }

    info_cols[[j]] <- c(sam, platform, gender, age)
    value_cols[[j]] <- tmp_data[, 2]
  }

  infor <- data.frame(field = field_names, info_cols,
                      stringsAsFactors = FALSE)
  gene <- data.frame(field = row_ids, value_cols, stringsAsFactors = FALSE)
  list(platform = platform, table = rbind(infor, gene))
}

datafiles <- list.files(data_dir, full.names = TRUE)
if (length(datafiles) == 0) {
  warning("no input files found in ", data_dir, call. = FALSE)
}

for (i in seq_along(datafiles)) {
  # load the data
  dataset <- read.csv(datafiles[i], header = FALSE, stringsAsFactors = FALSE)
  samples <- extract_samples(dataset, source = datafiles[i])

  if (is.null(samples)) {
    warning("skipping ", datafiles[i], ": no \"^SAMPLE\" line found",
            call. = FALSE)
    next
  }
  if (is.na(samples$platform)) {
    warning("skipping ", datafiles[i], ": no platform id found",
            call. = FALSE)
    next
  }

  # output the result we get
  output <- samples$table
  # Kept from the original output format: assigning a single "" makes the
  # first column name "" and pads the remaining names with NA.
  names(output) <- ""

  # NOTE: the file name depends only on the platform id of the last sample,
  # so two input files with the same platform overwrite each other, and a
  # file mixing several platforms is named after its last sample.
  file_name <- paste0(samples$platform, ".csv")
  location <- file.path(output_dir, file_name)
  write.csv(output, location, row.names = FALSE)
}