Commit 2167ed7633805943435203e3a476affea2899bcb

Authored by Efrain Gonzalez
1 parent 9c467bfff8
Exists in master

Update

Showing 1 changed file with 1 additions and 0 deletions   Show diff stats
1 #For Reading Raw Data from the created file 1 #For Reading Raw Data from the created file
2 2
3 #Required Libraries 3 #Required Libraries
4 library(MASS) 4 library(MASS)
5 library(dplyr) 5 library(dplyr)
6 library(tidyr) 6 library(tidyr)
7 library(readr) 7 library(readr)
8 library(stringr) 8 library(stringr)
9 9
10 10
11 #Necessary Functions 11 #Necessary Functions
12 12
#1# Function for discretizing the data
# Maps every entry of NDATA onto one of three levels:
#   value <  -1        -> 0
#   -1 <= value <= 1   -> 1
#   value >   1        -> 2
# Missing entries (NA / NaN) are copied through unchanged.
#
# NDATA:   numeric matrix (an all-numeric data frame is coerced with as.matrix).
# Returns: numeric matrix with the same dimensions and column names as NDATA.
dndat <- function(NDATA) {
  NDATA <- as.matrix(NDATA)

  # Start every cell in the middle band and overwrite only the two tails.
  # This guarantees that a value of exactly 1 is classed as "middle" instead
  # of falling through every comparison and silently keeping a default of 0.
  DDATA <- matrix(1, nrow = nrow(NDATA), ncol = ncol(NDATA))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops NA comparisons, so missing cells are not touched here.
  DDATA[which(NDATA < -1)] <- 0
  DDATA[which(NDATA > 1)] <- 2

  # Carry missing values over as they are (keeps NaN distinct from NA).
  missing_cells <- which(is.na(NDATA))
  DDATA[missing_cells] <- NDATA[missing_cells]

  DDATA
}
42 43
43 44
#Bringing in the file
# The user picks a tab-delimited file interactively. Its first line is skipped
# and no header is assumed, so readr names the columns X1, X2, ...
rawdat <- file.choose()
rawall <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)

# Rows whose first field mentions one of these labels are treated as clinical
# annotation; everything else is kept as data. The file is parsed only once
# and split with a single logical mask.
is_clinical <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", rawall[["X1"]])

RAWDAT <- rawall[!is_clinical, ]
# The first remaining row supplies the column names (it is removed again
# further down, before the numeric conversion). Flatten it to a plain
# character vector so the names attribute is well-formed.
header_names <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
names(RAWDAT) <- header_names

#Just the clinical data
RAWWORD <- rawall[is_clinical, ]
names(RAWWORD) <- header_names
#Add col of NAs to clinical data
# ROW_NAs = number of missing cells in each clinical row. rowSums() handles
# the zero-row case correctly, which an index loop over 1:nrow would not.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
65 66
66 67
##Getting back to the data
# Drop the first row (it was used for the column names) and sort by ID_REF.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# Identifier column, as a matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])
# Every other column is coerced to numeric, then the result is transposed so
# that each input column becomes one row of RAWDATNUM.
RAWDATNUM <- t(mapply(FUN = as.numeric, RAWDAT2[, -1]))
77 78
##Consolidating genes with the same name
# tabRDATID counts how often each identifier occurs; NuRDATN gets one column
# per distinct identifier, in the order table() reports them.
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  # Columns of RAWDATNUM that carry this identifier.
  same_id <- which(RAWDATID == rownames(tabRDATID)[j])
  if (length(same_id) == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, same_id]
  } else if (length(same_id) > 1) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even if RAWDATNUM has a single row, which
    # rowMeans() requires.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_id, drop = FALSE], na.rm = TRUE)
  }
}
93 94
94 95
#Scaling the Data
# Column-wise centring and scaling with the scale() defaults.
scrawdat <- scale(NuRDATN)
# Remove the attributes scale() attaches to its result.
for (scale_attr in c("scaled:center", "scaled:scale")) {
  attr(scrawdat, scale_attr) <- NULL
}
# Label each column with its identifier.
colnames(scrawdat) <- rownames(tabRDATID)
101 102
102 103
#Discretizing the data
# Discretize the scaled matrix, then transpose so that identifiers run down
# the rows; columns are labelled with the row names of RAWDATNUM.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
names(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# Move the row names into a regular first column called ID_REF.
genena <- data.frame(ID_REF = rownames(dialzdat))
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
116 117
#NAs in a column
# Build a one-row summary: "COL_NAs" in the first (ID_REF) column, followed by
# the number of missing values in each remaining column. colSums() also copes
# with the case of no data columns, where a 2:ncol loop would run backwards.
na_per_col <- colSums(is.na(dialzdat[, -1, drop = FALSE]))
nacol <- data.frame("COL_NAs", t(unname(na_per_col)), stringsAsFactors = FALSE)
colnames(nacol) <- colnames(dialzdat)
# Append the summary as the last row.
dialzdat <- bind_rows(dialzdat, nacol)
127 128
#NAs in a row
# ROW_NAs = number of missing cells in each row of dialzdat, computed in one
# vectorized call rather than row by row.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
137 138
#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one is converted to character.
# Note on the range: an expression such as 2:n-1 parses as (2:n) - 1, i.e.
# columns 1..(n-1), because ":" binds tighter than "-". The range is written
# out explicitly here so the first column is visibly included.
char_cols <- seq_len(max(ncol(dialzdat) - 1, 0))
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)
144 145
145 146
#The End the full data
# Clinical rows first, discretized rows underneath.
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Take the last component of the chosen path, keep only its digits, and wrap
# them as "GSE<digits>dscrt.txt".
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
file_part <- path_parts[length(path_parts)]
file_digits <- gsub("\\D", "", file_part)
nfnaex <- paste(c("GSE", file_digits, "dscrt.txt"), collapse = "")
# Tab-separated output with a header row and no row names.
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)
157 158
158 159