Commit 2167ed7633805943435203e3a476affea2899bcb
1 parent
9c467bfff8
Exists in
master
Update
Showing
1 changed file
with
1 additions
and
0 deletions
Show diff stats
RPostClean.R
1 | #For Reading Raw Data from the created file | 1 | #For Reading Raw Data from the created file |
2 | 2 | ||
3 | #Required Libraries | 3 | #Required Libraries |
4 | library(MASS) | 4 | library(MASS) |
5 | library(dplyr) | 5 | library(dplyr) |
6 | library(tidyr) | 6 | library(tidyr) |
7 | library(readr) | 7 | library(readr) |
8 | library(stringr) | 8 | library(stringr) |
9 | 9 | ||
10 | 10 | ||
11 | #Necessary Functions | 11 | #Necessary Functions |
12 | 12 | ||
13 | #1# Function for discretizing the data | 13 | #1# Function for discretizing the data |
# Discretize a numeric matrix (or data frame) into three levels.
#
# Args:
#   NDATA: numeric matrix or data frame; NA entries are allowed.
#
# Returns:
#   A numeric matrix with the same dimensions and column names as NDATA
#   (row names are not carried over) in which every cell is
#     0  when the input is below -1,
#     1  when the input lies in [-1, 1],
#     2  when the input is above 1,
#   and missing inputs are copied through unchanged.
dndat <- function(NDATA){
  vals <- as.matrix(NDATA)

  # Start every cell in the middle band and overwrite the two tails.
  # The middle band is closed on both sides: previously a value of exactly 1
  # matched none of the three tests and silently kept the initial 0 ("low").
  DDATA <- matrix(1, nrow = nrow(vals), ncol = ncol(vals))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are not touched here.
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2

  # Missing inputs stay missing in the output.
  missing_cells <- is.na(vals)
  DDATA[missing_cells] <- vals[missing_cells]

  DDATA
}
42 | 43 | ||
43 | 44 | ||
#Bringing in the file
# Ask the user for the tab-delimited input file. `rawdat` keeps the chosen
# path because the output file name is derived from it later in the script.
rawdat <- file.choose()

# Parse the file once (first line skipped, columns named X1, X2, ...) and
# split the rows by whether X1 mentions one of the clinical fields.
# Previously the same file was parsed twice with identical arguments.
rawall <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
clinical_rows <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", rawall$X1)

# Non-clinical rows; their first row supplies the column names.
RAWDAT <- dplyr::filter(rawall, !clinical_rows)
header <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
names(RAWDAT) <- header

#Just the clinical data
RAWWORD <- dplyr::filter(rawall, clinical_rows)
names(RAWWORD) <- header

#Add col of NAs to clinical data
# One NA count per clinical row. rowSums() also copes with zero clinical
# rows, where the former 1:n loop counted down and broke bind_cols().
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
65 | 66 | ||
66 | 67 | ||
##Getting back to the data
# Drop the header row (already used for the column names) and sort by ID.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# IDs as a one-column character matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])
# Convert every remaining column to numeric and transpose, so that rows are
# the original columns (row names = their names) and columns follow
# RAWDATID. cbind() always returns a matrix, even for a single data row.
RAWDATNUM <- t(do.call(cbind, lapply(RAWDAT2[, -1], as.numeric)))

##Consolidating genes with the same name
tabRDATID <- table(RAWDATID)
gene_ids <- names(tabRDATID)
# Column positions of every ID, computed in one pass instead of rescanning
# RAWDATID for each unique ID. The explicit levels keep the list in the same
# order as tabRDATID.
cols_by_gene <- split(
  seq_along(RAWDATID),
  factor(as.vector(RAWDATID), levels = gene_ids)
)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(gene_ids))
for (j in seq_along(gene_ids)) {
  gene_cols <- cols_by_gene[[j]]
  if (length(gene_cols) == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, gene_cols]
  } else {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, gene_cols, drop = FALSE], na.rm = TRUE)
  }
}
93 | 94 | ||
94 | 95 | ||
#Scaling the Data
# Center and scale each column, then drop the bookkeeping attributes that
# scale() attaches so only the plain matrix remains.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)


#Discretized the Data
# Discretize, then transpose so the former columns of scrawdat become rows.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
names(dialzdat) <- rownames(RAWDATNUM)

#gene names
# Move the row names into a leading "ID_REF" column.
genena <- as.data.frame(matrix(rownames(dialzdat), ncol = 1))
names(genena) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
116 | 117 | ||
#NAs in a column
# One summary row: the label "COL_NAs" in the first column, then the NA
# count of every other column. colSums() replaces the 2:n loop, which
# counted down (2, 1) when only the ID column was present.
na_per_col <- unname(colSums(is.na(dialzdat[-1])))
nacol <- as.data.frame(t(c(0, na_per_col)), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

#NAs in a row
# NA count per row, taken after the COL_NAs row has been appended.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

#converting to character so that the clinical can be brought together with discrete data
# Every column except the trailing ROW_NAs is converted. The original range
# `2:n-1` parses as `(2:n) - 1`, i.e. 1:(n-1); the same columns are written
# out explicitly here.
char_cols <- seq_len(ncol(dialzdat) - 1)
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)
144 | 145 | ||
145 | 146 | ||
#The End the full data
# Clinical rows first, discretized rows underneath.
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Output name: "GSE" + the digits found in the last component of the chosen
# input path + "dscrt.txt"; written to the current working directory.
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
file_digits <- gsub("\\D", "", path_parts[length(path_parts)])
nfnaex <- paste(c("GSE", file_digits, "dscrt.txt"), collapse = "")
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)
157 | 158 | ||
158 | 159 |