Commit 788834dd790ca6d024f704be6cb685756857a503
1 parent
69cbaf694d
Exists in
master
This code takes the clean data and discretizes it
Showing
1 changed file
with
157 additions
and
0 deletions
 
Show diff stats
RPostClean.R
| ... | ... | @@ -0,0 +1,157 @@ | 
| 1 | +#For Reading Raw Data from the created file | |
| 2 | + | |
| 3 | +#Required Libraries | |
| 4 | +library(MASS) | |
| 5 | +library(dplyr) | |
| 6 | +library(tidyr) | |
| 7 | +library(readr) | |
| 8 | +library(stringr) | |
| 9 | + | |
| 10 | + | |
| 11 | +#Necessary Functions | |
| 12 | + | |
#1# Function for discretizing the data
# Maps every cell of a numeric matrix to one of three classes:
#   0  when the value is below -1
#   1  when the value lies in [-1, 1] (both ends included)
#   2  when the value is above 1
# Missing cells (NA / NaN) are carried over unchanged.
#
# NDATA: numeric matrix, or anything as.matrix() turns into one.
# Returns a numeric matrix with the same dimensions and column names as
# NDATA; row names are not carried over.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start every cell in the lowest class; the masks below promote cells.
  DDATA <- matrix(0, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(values)

  # which() drops NA comparisons, so missing cells never enter a mask.
  DDATA[which(values >= -1 & values <= 1)] <- 1
  DDATA[which(values > 1)] <- 2

  # Copy missing cells through as they are (keeps NA vs NaN distinct).
  missing_cells <- which(is.na(values))
  DDATA[missing_cells] <- values[missing_cells]

  DDATA
}
| 42 | + | |
| 43 | + | |
#Bringing in the file
# Let the user pick the tab-delimited input interactively.
rawdat <- file.choose()

# Rows whose first field matches any of these labels are treated as clinical
# annotation rows; every other row belongs to the data table.
clinical_pattern <- "Group|Age|Region|PMI|Title|Sex|Braak"

# Parse the file once (first line skipped, no header row) and split it with a
# single logical mask instead of reading it a second time.
alltab <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
is_clinical <- grepl(clinical_pattern, alltab$X1)

# The first non-clinical row supplies the column names for both tables.
RAWDAT <- filter(alltab, !is_clinical)
header_names <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
names(RAWDAT) <- header_names

#Just the clinical data
RAWWORD <- filter(alltab, is_clinical)
names(RAWWORD) <- header_names

#Add col of NAs to clinical data
# rowSums() also copes with zero clinical rows, where a 1:nrow loop would
# visit a non-existent row.
naroww <- data.frame(
  ROW_NAs = as.numeric(rowSums(is.na(RAWWORD))),
  stringsAsFactors = FALSE
)
RAWWORD <- bind_cols(RAWWORD, naroww)
| 65 | + | |
| 66 | + | |
##Getting back to the data
# Row 1 only held the header (already stored as column names); drop it and
# order the remaining rows by identifier.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# Identifiers as a one-column character matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# One numeric row per remaining column of RAWDAT2, one column per row of
# RAWDAT2. Binding the per-column vectors as rows keeps the result a matrix
# even when RAWDAT2 has a single row, where simplifying then transposing
# would return the wrong orientation.
RAWDATNUM <- do.call(rbind, lapply(RAWDAT2[, -1], as.numeric))
| 77 | + | |
##Consolidating genes with the same name
# tabRDATID counts how often each identifier occurs; its names are the unique
# identifiers in sorted order and define the columns of NuRDATN.
tabRDATID <- table(RAWDATID)
gene_ids <- names(tabRDATID)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(gene_ids))

for (g in seq_along(gene_ids)) {
  cols <- which(RAWDATID == gene_ids[g])
  if (length(cols) == 1L) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, g] <- RAWDATNUM[, cols]
  } else {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row;
    # rowMeans() rejects a plain vector.
    NuRDATN[, g] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
  }
}
| 93 | + | |
| 94 | + | |
#Scaling the Data
# Centre and scale each column of the consolidated matrix.
scrawdat <- scale(NuRDATN)

# scale() records the centring/scaling vectors as attributes; strip them so
# only the plain matrix is passed on.
for (scale_attr in c("scaled:center", "scaled:scale")) {
  attr(scrawdat, scale_attr) <- NULL
}

# Label the columns with the unique identifiers from the frequency table.
colnames(scrawdat) <- rownames(tabRDATID)
| 101 | + | |
| 102 | + | |
#Discretized the Data
# Discretize the scaled matrix, then transpose so each identifier becomes a
# row, and convert to a data frame.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# Move the identifiers out of the row names into a leading ID_REF column.
genena <- data.frame(ID_REF = rownames(dialzdat))
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
| 116 | + | |
#NAs in a column
# Count missing cells in every column after ID_REF and append the counts as
# an extra row labelled "COL_NAs". colSums() also handles the case of no
# columns after ID_REF, where a 2:ncol loop would run backwards.
col_na_counts <- as.numeric(colSums(is.na(dialzdat[, -1, drop = FALSE])))
nacol <- as.data.frame(
  c(list("COL_NAs"), as.list(col_na_counts)),
  stringsAsFactors = FALSE
)
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

#NAs in a row
# Count missing cells in every row (the COL_NAs row included) and append the
# counts as a trailing ROW_NAs column.
narowd <- data.frame(
  ROW_NAs = as.numeric(rowSums(is.na(dialzdat))),
  stringsAsFactors = FALSE
)
dialzdat <- bind_cols(dialzdat, narowd)
| 137 | + | |
#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one (the ROW_NAs count) is turned into
# character, ID_REF included. The index range is spelled out explicitly:
# an expression such as `2:n - 1` evaluates as `(2:n) - 1`, i.e. 1..(n - 1),
# which is easy to misread as starting at column 2.
char_cols <- seq_len(max(ncol(dialzdat) - 1L, 0L))
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)
| 144 | + | |
| 145 | + | |
#The End: clinical rows stacked on top of the discretized data
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Take the last component of the chosen path, keep only its digits, and build
# the output name "GSE<digits>dscrt.txt" (written relative to the working
# directory).
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
file_digits <- gsub("\\D", "", path_parts[length(path_parts)])
nfnaex <- paste(c("GSE", file_digits, "dscrt.txt"), collapse = "")

write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)
| 157 | + |