Commit 658ed0df54291b9bdcc2ac2ed94a29e598a022a1
1 parent
b688af3fe9
Exists in
master
upload
Showing 2 changed files with 820 additions and 13 deletions
Show diff stats
GPL22111.csv
data preparation.R
| 1 | 1 | ||
| 2 | 2 | ||
| 3 | #set the working directory | 3 | #set the working directory |
| 4 | #make it be the data file | 4 | #make it be the data file |
| 5 | setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning/data") | 5 | setwd("/home/qja0428/Dropbox/research/Biostat/data") |
| 6 | 6 | ||
| 7 | 7 | ||
| 8 | datafiles <- list.files() | 8 | datafiles <- list.files() |
| 9 | output <- data.frame() | 9 | output <- data.frame() |
| 10 | 10 | ||
| 11 | for (i in seq_along(datafiles)){ | 11 | for (i in seq_along(datafiles)){ |
| 12 | 12 | ||
| 13 | #load the data | 13 | #load the data |
| 14 | dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F) | 14 | dataset <- read.csv(datafiles[i],header = F,stringsAsFactors = F) |
| 15 | 15 | ||
| 16 | #find the row numbers that contain "^SAMPLE" | 16 | #find the row numbers that contain "^SAMPLE" |
| 17 | sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE) | 17 | sampleID <- grep("^SAMPLE", dataset[,1],fixed = TRUE) |
| 18 | 18 | ||
| 19 | #find the row numbers that contain "!sample_table_begin" | 19 | #find the row numbers that contain "!sample_table_begin" |
| 20 | begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE) | 20 | begin <- grep("!sample_table_begin", dataset[,1],fixed = TRUE) |
| 21 | 21 | ||
| 22 | #find the row numbers that contain "!sample_table_end" | 22 | #find the row numbers that contain "!sample_table_end" |
| 23 | end <- grep("!sample_table_end", dataset[,1],fixed = TRUE) | 23 | end <- grep("!sample_table_end", dataset[,1],fixed = TRUE) |
| 24 | 24 | ||
| 25 | #find the row numbers that contain "sex" | 25 | #find the row numbers that contain "sex" |
| 26 | sex <- grep("= Sex: ", dataset[,1],fixed = TRUE) | 26 | sex <- grep("= Sex: ", dataset[,1],fixed = TRUE) |
| 27 | 27 | ||
| 28 | #find the row numbers that contain "age" | 28 | #find the row numbers that contain "age" |
| 29 | age <- grep("= age: ", dataset[,1],fixed = TRUE) | 29 | age <- grep("= age: ", dataset[,1],fixed = TRUE) |
| 30 | 30 | ||
| 31 | #find the row number that contain "" | 31 | #find the row number that contain "" |
| 32 | #you can define whatever other information you want in the data | 32 | #you can define whatever other information you want in the data |
| 33 | 33 | ||
| 34 | 34 | ||
| 35 | #find the row numbers that contain the platform_id | 35 | #find the row numbers that contain the platform_id |
| 36 | platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE) | 36 | platID <- grep("Sample_platform_id", dataset[,1],fixed = TRUE) |
| 37 | 37 | ||
| 38 | name <- c("sampleID","platform","gender","age") | ||
| 38 | 39 | ||
| 40 | #store all the information | ||
| 41 | infor <- data.frame(name) | ||
| 42 | |||
| 43 | #store all the data we need | ||
| 44 | gene <- data.frame() | ||
| 39 | 45 | ||
| 40 | for (j in seq_along(sampleID)) { | 46 | for (j in seq_along(sampleID)) { |
| 41 | #get the data | 47 | #get the data |
| 42 | tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2] | 48 | tmp_data <- dataset[(begin[j]+2):(end[j]-1),1:2] |
| 43 | tmp_data[,2] <- as.numeric(tmp_data[,2]) | 49 | tmp_data[,2] <- tmp_data[,2] |
| 44 | 50 | ||
| 45 | #get the exact information | 51 | #get the exact information |
| 46 | #get the sample# | 52 | #get the sample# |
| 47 | sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE) | 53 | sam <- sub("^SAMPLE = ","",dataset[sampleID[j],1],fixed = TRUE) |
| 48 | #get the platform | 54 | #get the platform |
| 49 | platform <- sub("!Sample_platform_id = ", | 55 | platform <- sub("!Sample_platform_id = ", |
| 50 | "", dataset[platID[j],1],fixed = TRUE) | 56 | "", dataset[platID[j],1],fixed = TRUE) |
| 51 | #get the gender | 57 | #get the gender |
| 52 | gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2] | 58 | gender <- strsplit(dataset[sex[j],1], ": ")[[1]][2] |
| 53 | #get the age | 59 | #get the age |
| 54 | Age <- strsplit(dataset[age[j],1], ": ")[[1]][2] | 60 | Age <- strsplit(dataset[age[j],1], ": ")[[1]][2] |
| 55 | Age <- as.numeric(Age) | 61 | Age <- as.numeric(Age) |
| 56 | 62 | ||
| 63 | #store the information into the data frame | ||
| 64 | infor <- cbind(infor, c(sam, platform, gender, Age)) | ||
| 65 | |||
| 57 | #combine the data into temporary data | 66 | #combine the data into temporary data |
| 58 | n <- nrow(tmp_data) | 67 | if (j == 1){ |
| 59 | tmp_data <- cbind(tmp_data, rep(sam,n), rep(platform,n), | 68 | gene <- tmp_data |
| 60 | rep(gender, n), rep(Age, n)) | 69 | } else { |
| 70 | gene <- cbind(gene, tmp_data[,2]) | ||
| 71 | } | ||
| 72 | |||
| 73 | |||
| 74 | |||
| 61 | 75 | ||
| 62 | #combine the temporary data into output | ||
| 63 | output <- rbind(output, tmp_data) | ||
| 64 | 76 | ||
| 65 | } | 77 | } |
| 66 | 78 | ||
| 79 | #output the result we get | ||
| 80 | names(infor) <- "" | ||
| 81 | names(gene) <- "" | ||
| 82 | output <- rbind(infor, gene) | ||
| 83 | |||
| 84 | file_name <- paste0(platform,".csv") | ||
| 85 | location <- file.path("/home/qja0428/Dropbox/research/Biostat",file_name) | ||
| 86 | |||
| 87 | write.csv(output,location, row.names = FALSE) | ||
| 88 | |||
| 67 | 89 | ||
| 68 | #check how many distinct platform_ids there are | 90 | #check how many distinct platform_ids there are |
| 69 | # platform <- dataset[platID, 1] | 91 | # platform <- dataset[platID, 1] |
| 70 | # n_plat <- length(unique(platform)) | 92 | # n_plat <- length(unique(platform)) |
| 71 | 93 | ||
| 72 | 94 | ||
| 73 | # if (n_plat == 1){ | 95 | # if (n_plat == 1){ |
| 74 | # for (j in seq_along(sampleID)) { | 96 | # for (j in seq_along(sampleID)) { |
| 75 | 97 | ||
| 76 | #get the information for one sample | 98 | #get the information for one sample |
| 77 | # infor <- dataset[sampleID[j]:(begin[j]-1),1] | 99 | # infor <- dataset[sampleID[j]:(begin[j]-1),1] |
| 78 | # id <- infor | 100 | # id <- infor |
| 79 | 101 | ||
| 80 | # } | 102 | # } |
| 81 | # } else{ | 103 | # } else{ |
| 82 | 104 | ||
| 83 | # } | 105 | # } |
| 84 | 106 | ||
| 85 | } | 107 | } |
| 86 | 108 | ||
| 87 | #name the columns | ||
| 88 | names(output) <- c("ID_REF","Value","Sample","Platform","Gender","Age") | ||
| 89 | |||
| 90 | #save the output into local | ||
| 91 | setwd("/Users/jinganqu/Dropbox/Research/Biostat/data_cleaning") | ||
| 92 | write.csv(output, "output.csv", row.names = F) | ||
| 93 | 109 | ||
| 94 | 110 | ||
| 95 | 111 | ||
| 96 | 112 | ||
| 97 | 113 | ||
| 98 | 114 | ||
| 99 | 115 | ||
| 100 | 116 | ||
| 101 | 117 |