Commit 788834dd790ca6d024f704be6cb685756857a503
1 parent
69cbaf694d
Exists in
master
This code takes the clean data and discretizes it
Showing
1 changed file
with
157 additions
and
0 deletions
Show diff stats
RPostClean.R
... | ... | @@ -0,0 +1,157 @@ |
1 | +#For Reading Raw Data from the created file | |
2 | + | |
3 | +#Required Libraries | |
4 | +library(MASS) | |
5 | +library(dplyr) | |
6 | +library(tidyr) | |
7 | +library(readr) | |
8 | +library(stringr) | |
9 | + | |
10 | + | |
11 | +#Necessary Functions | |
12 | + | |
#1# Function for discretizing the data
#
# Maps every non-missing cell of NDATA to one of three levels:
#   0 : value <  -1
#   1 : -1 <= value <= 1
#   2 : value >   1
# Missing cells (NA / NaN) are passed through unchanged.
#
# NDATA   : numeric matrix (or an object as.matrix() can turn into one).
# returns : double matrix with the same dimensions and column names as
#           NDATA; row names are not carried over.
#
# A value exactly equal to 1 is placed in the middle level, so the middle
# band is the closed interval [-1, 1] and every finite value gets a level.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start from a copy of the input so missing cells keep their NA / NaN.
  DDATA <- matrix(
    as.numeric(values),
    nrow = nrow(values),
    ncol = ncol(values)
  )
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA entries of each comparison, so only non-missing
  # cells are overwritten.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values >= -1 & values <= 1)] <- 1
  DDATA[which(values > 1)] <- 2

  DDATA
}
42 | + | |
43 | + | |
#Bringing in the file
# Ask the user for the tab-delimited input file. The chosen path is used
# again at the end of the script to build the output file name.
rawdat <- file.choose()

# Rows whose first field matches one of these labels are treated as clinical
# annotation; all remaining rows are the header row plus the data rows.
clinical_labels <- "Group|Age|Region|PMI|Title|Sex|Braak"

# Parse the file once (skipping its first line) and split the result, rather
# than reading the same file twice.
rawall <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
is_clinical <- grepl(clinical_labels, rawall$X1)

RAWDAT <- filter(rawall, !is_clinical)

# The first non-clinical row holds the column labels (including ID_REF,
# which later steps sort on). Flatten it to a plain character vector before
# using it as names.
header_names <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
names(RAWDAT) <- header_names

#Just the clinical data
RAWWORD <- filter(rawall, is_clinical)
names(RAWWORD) <- header_names

#Add col of NAs to clinical data
# ROW_NAs = number of missing fields in each clinical row. Kept as double so
# the column type is the same as before.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
65 | + | |
66 | + | |
##Getting back to the data
# Drop the header row (it is already stored as the column names) and sort
# the remaining rows by identifier.
RAWDAT2 <- RAWDAT[-1, ] %>%
  dplyr::arrange(ID_REF)

##Editing the file for R processing
# Identifiers as a one-column character matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# One row per input column (named after that column), one column per data
# row. Binding the converted columns as rows gives this orientation directly
# and stays a matrix even when RAWDAT2 has a single row.
RAWDATNUM <- do.call(rbind, lapply(RAWDAT2[, -1], as.numeric))

##Consolidating genes with the same name
tabRDATID <- table(RAWDATID)

# Column positions of RAWDATNUM grouped by identifier, computed in one pass.
# split() and table() both order their groups by the sorted unique
# identifiers, so element g here belongs to tabRDATID[g].
id_columns <- split(seq_along(RAWDATID), as.vector(RAWDATID))

NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
for (g in seq_along(tabRDATID)) {
  cols <- id_columns[[g]]
  if (length(cols) == 1L) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, g] <- RAWDATNUM[, cols]
  } else {
    ##Averaging duplicates and putting them in their new homes
    NuRDATN[, g] <- rowMeans(
      RAWDATNUM[, cols, drop = FALSE],
      na.rm = TRUE
    )
  }
}
93 | + | |
94 | + | |
#Scaling the Data
# Centre and scale each column of the consolidated matrix, then drop the
# bookkeeping attributes scale() attaches so only the plain matrix remains.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)


#Discretized the Data
# dndat() keeps the column names, so after transposing they become the row
# names: one row per identifier, one column per row of RAWDATNUM.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# Built directly as a character column so it never becomes a factor, which
# would clash with the character "COL_NAs" row appended later.
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
116 | + | |
#NAs in a column
# One summary row: "COL_NAs" in the ID_REF position, followed by the number
# of missing values in every other column.
na_per_col <- colSums(is.na(dialzdat[, -1, drop = FALSE]))
nacol <- as.data.frame(
  c(list("COL_NAs"), as.list(unname(na_per_col))),
  stringsAsFactors = FALSE
)
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

#NAs in a row
# Number of missing values in each row, including the COL_NAs summary row
# that was just appended.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

#converting to character so that the clinical can be brought together with discrete data
# Every column except the trailing ROW_NAs is converted. The previous
# `2:dim(dialzdat)[2]-1` parsed as `(2:n) - 1`, i.e. 1:(n - 1); that same
# range is written out explicitly here.
for (k in seq_len(ncol(dialzdat) - 1L)) {
  dialzdat[[k]] <- as.character(dialzdat[[k]])
}
144 | + | |
145 | + | |
#The End the full data we seem to have found Carmen
# Clinical rows first, then the discretized rows; columns are matched by
# name.
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Output name is "GSE" + every digit found in the input file's name +
# "dscrt.txt". basename() strips the directory part, replacing the manual
# split that also treated a literal "|" as a path separator.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(rawdat)), "dscrt.txt")

# Tab-separated, with a header line and without row names. The name carries
# no directory, so the file is created in the current working directory.
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)
157 | + |