Commit 8bfefd7afece42806a3e35e41fabe6e2d817255b

Authored by Efrain Gonzalez
1 parent 788834dd79
Exists in master

Update

Showing 1 changed file with 1 additions and 1 deletions   Show diff stats
1 #For Reading Raw Data from the created file 1 #For Reading Raw Data from the created file
2 2
3 #Required Libraries 3 #Required Libraries
4 library(MASS) 4 library(MASS)
5 library(dplyr) 5 library(dplyr)
6 library(tidyr) 6 library(tidyr)
7 library(readr) 7 library(readr)
8 library(stringr) 8 library(stringr)
9 9
10 10
11 #Necessary Functions 11 #Necessary Functions
12 12
#1# Function for discretizing the data
# Maps every numeric cell of NDATA to one of three levels:
#   0 : value < -1
#   1 : -1 <= value <= 1
#   2 : value > 1
# Missing cells (NA / NaN) are copied through unchanged.
#
# NDATA   : numeric matrix (or all-numeric data frame).
# returns : double matrix with the same dimensions and column names as NDATA
#           (row names are not carried over).
dndat <- function(NDATA){
  values <- as.matrix(NDATA)

  # Start everything in the middle bin, then overwrite the two tails.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops NA comparisons so missing cells never select a tail.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  # A value of exactly 1 stays in the middle bin; previously it matched no
  # condition and silently kept the initial 0 ("low") value.
  missing_cells <- is.na(values)
  DDATA[missing_cells] <- values[missing_cells]

  DDATA
}
42 42
43 43
#Bringing in the file
# Ask the user for the tab-delimited input file, then read it once
# (first line skipped, no header row) and split it into data rows and
# clinical rows.
rawdat <- file.choose()
rawall <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)

# Rows whose first field mentions one of these keywords are treated as
# clinical annotation; everything else is kept as data.
is_clinical <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", rawall$X1)

RAWDAT <- rawall[!is_clinical, ]
# The first remaining row supplies the column names (later code expects a
# column called ID_REF); it is removed from the data further down.
names(RAWDAT) <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))

#Just the clinical data
RAWWORD <- rawall[is_clinical, ]
names(RAWWORD) <- names(RAWDAT)

#Add col of NAs to clinical data
# ROW_NAs = number of missing cells in each clinical row. rowSums() also
# copes with a file that has no clinical rows, where 1:0 would not.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
65 65
66 66
##Getting back to the data
# Drop the first row (it was used for the column names) and order the
# remaining rows by the ID_REF column.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# Identifier column as a one-column matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])
# Every other column converted to numeric, then transposed so that each
# original column becomes a row and each original row a column.
RAWDATNUM <- t(mapply(RAWDAT2[, -1], FUN = as.numeric))
77 77
##Consolidating genes with the same name
# One output column per distinct identifier in RAWDATID, in table() order.
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  id_cols <- which(RAWDATID == rownames(tabRDATID)[j])
  if (tabRDATID[[j]] == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, id_cols]
  } else if (tabRDATID[[j]] > 1) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row,
    # which rowMeans() requires.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, id_cols, drop = FALSE], na.rm = TRUE)
  }
}
93 93
94 94
#Scaling the Data
# Centre and scale every column of the consolidated matrix.
scrawdat <- scale(NuRDATN)
# Remove the bookkeeping attributes scale() attaches to its result.
for (scale_attr in c("scaled:center", "scaled:scale")) {
  attr(scrawdat, scale_attr) <- NULL
}
# Label the columns with the identifiers they were consolidated under.
colnames(scrawdat) <- rownames(tabRDATID)
101 101
102 102
#Discretized the Data
# Discretise the scaled matrix with dndat(), transpose it back so that each
# identifier is a row, and turn it into a data frame.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# Kept explicitly as character so the result does not depend on the
# R-version default for stringsAsFactors.
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
116 116
#NAs in a column
# Build a one-row summary holding the number of missing values in every
# column of dialzdat, label it "COL_NAs" in the first column, and append it
# as the last row.
na_per_col <- colSums(is.na(dialzdat))
nacol <- as.data.frame(t(na_per_col), stringsAsFactors = FALSE)
colnames(nacol) <- colnames(dialzdat)
nacol[[1]] <- "COL_NAs"
dialzdat <- bind_rows(dialzdat, nacol)
127 127
#NAs in a row
# ROW_NAs = number of missing cells in each row of dialzdat (all columns
# counted), appended as the last column.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
137 137
#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one is converted to character. The original
# range `2:dim(dialzdat)[2]-1` parses as `(2:n) - 1`, i.e. 1:(n - 1), because
# `:` binds tighter than binary minus; that range is spelled out here so the
# result is unchanged (the first column is already text, the last stays
# numeric).
chr_cols <- seq_len(max(ncol(dialzdat) - 1, 0))
dialzdat[chr_cols] <- lapply(dialzdat[chr_cols], as.character)
144 144
145 145
#The End the full data
# Clinical rows on top, discretised rows underneath.
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Output name is "GSE" + every digit found in the chosen file's name +
# "dscrt.txt"; it is written relative to the current working directory.
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
input_name <- path_parts[length(path_parts)]
name_digits <- gsub("\\D", "", input_name)
nfnaex <- paste(c("GSE", name_digits, "dscrt.txt"), collapse = "")
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)
157 157
158 158