Commit 788834dd790ca6d024f704be6cb685756857a503

Authored by Efrain Gonzalez
1 parent 69cbaf694d
Exists in master

This code takes the clean data and discretizes it

Showing 1 changed file with 157 additions and 0 deletions   Show diff stats
File was created 1 #For Reading Raw Data from the created file
2
3 #Required Libraries
4 library(MASS)
5 library(dplyr)
6 library(tidyr)
7 library(readr)
8 library(stringr)
9
10
11 #Necessary Functions
12
#1# Function for discretizing the data
# Maps every cell of a numeric matrix onto one of three levels:
#   0  when the value is below -1
#   1  when the value lies in [-1, 1] (both ends included)
#   2  when the value is above 1
# Missing cells (NA/NaN) are passed through unchanged.
#
# NDATA: numeric matrix, or anything as.matrix() can turn into one.
# Returns a numeric matrix with the same dimensions as NDATA and the same
# column names.
dndat <- function(NDATA){
  vals <- as.matrix(NDATA)

  # Start from a plain numeric copy so that missing cells keep their value;
  # every non-missing cell is overwritten by exactly one of the rules below.
  DDATA <- matrix(as.numeric(vals), nrow = nrow(vals), ncol = ncol(vals))
  colnames(DDATA) <- colnames(NDATA)

  # which() discards the NA results of the comparisons, so missing cells are
  # never selected. The middle class is closed at +1: previously a value of
  # exactly 1 matched no rule and silently ended up in class 0.
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals >= -1 & vals <= 1)] <- 1
  DDATA[which(vals > 1)] <- 2

  DDATA
}
42
43
#Bringing in the file
# Ask the user to pick the input file interactively.
rawdat <- file.choose()

# Read it as tab-delimited text without a header, skipping the first line,
# then keep only the rows whose first field does NOT mention one of the
# clinical labels.
RAWDAT <- read_delim(rawdat, delim ="\t", col_names = FALSE, skip = 1)
RAWDAT <- filter(RAWDAT, !grepl("Group|Age|Region|PMI|Title|Sex|Braak", X1))

# The first remaining row supplies the column names (the row itself is
# still present in the table at this point).
names(RAWDAT) <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
50
#Just the clinical data
# Re-read the same file and keep only the rows whose first field mentions
# one of the clinical labels (the complement of the filter used for RAWDAT).
RAWWORD <- read_delim(rawdat, delim ="\t", col_names = FALSE, skip = 1)
RAWWORD <- filter(RAWWORD, grepl("Group|Age|Region|PMI|Title|Sex|Braak", X1))

# Same column names as the expression table: taken from RAWDAT's first row.
names(RAWWORD) <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))

#Add col of NAs to clinical data
# Count the missing cells of every row in one vectorised step. Unlike a
# 1:nrow loop this also works when no clinical row matched (zero rows).
# The counts are stored as doubles, the same type the column had before.
naroww <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
65
66
##Getting back to the data
# Drop the first row (it was used for the column names) and sort the
# remaining rows by ID_REF.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# Identifier column as a one-column matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Convert every remaining column to numeric, then transpose: RAWDATNUM ends
# up with one row per data column of RAWDAT2 and one column per row of it.
RAWDATNUM <- t(mapply(FUN = as.numeric, RAWDAT2[, -1]))
77
##Consolidating genes with the same name
# tabRDATID counts how often each identifier occurs. NuRDATN gets one column
# per distinct identifier and one row per row of RAWDATNUM.
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))

# seq_along() keeps the loop from running when there are no identifiers.
for (j in seq_along(tabRDATID)) {
  # Columns of RAWDATNUM that carry the j-th identifier.
  id_cols <- which(RAWDATID == names(tabRDATID)[j])

  if (tabRDATID[j] == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, id_cols]
  } else if (tabRDATID[j] > 1) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps the selection a matrix even when RAWDATNUM has a
    # single row; otherwise rowMeans() would be handed a plain vector and fail.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, id_cols, drop = FALSE], na.rm = TRUE)
  }
}
93
94
#Scaling the Data
# Centre and scale every column of the consolidated matrix.
scrawdat <- scale(NuRDATN)

# Remove the bookkeeping attributes scale() attaches, leaving a plain matrix.
attr(scrawdat, "scaled:scale") <- NULL
attr(scrawdat, "scaled:center") <- NULL

# Label the columns with the distinct identifiers.
colnames(scrawdat) <- names(tabRDATID)
101
102
#Discretized the Data
# Discretise the scaled matrix, flip it back so identifiers run down the
# rows, and turn it into a data frame.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
# Pull the row names out into their own one-column data frame ...
genena <- as.data.frame(matrix(rownames(dialzdat), ncol = 1))
#setting "ID_REF" as a new variable
names(genena) <- "ID_REF"

# ... drop them from the data frame and put the new column in front.
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
116
#NAs in a column
# Build a one-row data frame: the label "COL_NAs" in the first column and,
# for every other column, the number of missing values in that column.
nacol <- as.data.frame(t(rep(0, ncol(dialzdat))), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
value_cols <- seq_len(ncol(dialzdat))[-1]
nacol[1, value_cols] <- colSums(is.na(dialzdat[, value_cols, drop = FALSE]))

# Give the summary row the same column names and append it at the bottom.
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)
127
#NAs in a row
# Number of missing cells in every row (including the COL_NAs row), stored
# as a numeric column named ROW_NAs and attached on the right.
narowd <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
137
#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one (ROW_NAs) is turned into character.
# Note: this range starts at column 1, which is what the former expression
# `2:dim(dialzdat)[2]-1` evaluated to, because `:` binds tighter than `-`.
text_cols <- seq_len(ncol(dialzdat) - 1)
dialzdat[text_cols] <- lapply(dialzdat[text_cols], as.character)
144
145
#The End the full data we seem to have found Carmen
# Stack the clinical rows on top of the discretised rows.
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Output name: take the last component of the chosen path (split on
# backslash, pipe or slash), keep only its digits, and wrap them as
# "GSE<digits>dscrt.txt". The file is written relative to the working
# directory.
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
file_digits <- gsub("\\D", "", path_parts[length(path_parts)])
nfnaex <- paste(c("GSE", file_digits, "dscrt.txt"), collapse = "")

write.table(Fullalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)
157
158