Blame view

RPostClean.R 3.64 KB
788834dd7   Efrain Gonzalez   This code takes t...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
  #For Reading Raw Data from the created file
  
  #Required Libraries
  library(MASS)
  library(dplyr)
  library(tidyr)
  library(readr)
  library(stringr)
  
  
  #Necessary Functions
  
  #1# Function for discretizing the data
  # Maps every entry of a numeric table onto three levels:
  #   value < -1         -> 0
  #   -1 <= value <= 1   -> 1
  #   value > 1          -> 2
  # Entries for which is.na() is TRUE are copied through unchanged.
  #
  # NDATA: numeric matrix (anything as.matrix() turns into a numeric matrix).
  # Returns a numeric matrix with the same dimensions and column names as
  # NDATA; row names are not carried over.
  dndat <- function(NDATA){
  	VALS <- as.matrix(NDATA)
  	# Start from 0 so that everything below -1 already has its final level
  	DDATA <- matrix(0, nrow = nrow(VALS), ncol = ncol(VALS))
  	colnames(DDATA) <- colnames(NDATA)
  	# Pass missing entries through as they are
  	MISSING <- is.na(VALS)
  	DDATA[MISSING] <- VALS[MISSING]
  	# The middle band is closed on both sides: a value of exactly 1 belongs
  	# here (previously it matched no branch and silently stayed at 0).
  	# which() discards NA comparisons, so missing cells are never overwritten.
  	DDATA[which(VALS >= -1 & VALS <= 1)] <- 1
  	DDATA[which(VALS > 1)] <- 2
  	DDATA
  }
  
  
  #Bringing in the file
  rawdat <- file.choose()
  #Rows whose first field matches one of these labels are clinical annotations
  clinlab <- "Group|Age|Region|PMI|Title|Sex|Braak"
  #Parse the file once; both tables below are row subsets of this one parse.
  #With col_names = FALSE the first column is referred to as X1.
  ALLDAT <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
  isclin <- grepl(clinlab, ALLDAT$X1)
  
  #Everything that is not clinical; its first row holds the column headers
  RAWDAT <- filter(ALLDAT, !isclin)
  hdrrow <- RAWDAT[1,]
  attributes(RAWDAT)$names <- hdrrow
  
  #Just the clinical data
  RAWWORD <- filter(ALLDAT, isclin)
  attributes(RAWWORD)$names <- hdrrow
  #Add col of NAs to clinical data
  #(one count per clinical row; also works when no clinical row matched)
  naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
  RAWWORD <- bind_cols(RAWWORD,naroww)
  
  
  ##Getting back to the data
  #Drop the header row (already used for the column names) and sort by ID_REF
  RAWDAT2 <- dplyr::arrange(RAWDAT[-1,], ID_REF)
  
  ##Editing the file for R processing
  RAWDATID <- as.matrix(RAWDAT2[,1])
  #One row per remaining column of RAWDAT2, one column per ID_REF row.
  #rbind() keeps this orientation even when only a single ID_REF row exists.
  RAWDATNUM <- do.call(rbind, lapply(RAWDAT2[,-1], as.numeric))
  
  ##Consolidating genes with the same name
  tabRDATID <- table(RAWDATID)
  #Column positions of every distinct ID, in the same order as tabRDATID
  idcols <- split(seq_along(RAWDATID),
  	factor(RAWDATID, levels = rownames(tabRDATID)))
  NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
  for(j in seq_along(idcols)){
  	cols <- idcols[[j]]
  	if(length(cols) == 1){
  		##Putting the ones without duplicates in their new homes
  		NuRDATN[,j] <- RAWDATNUM[,cols]
  	} else {
  		##Averaging duplicates and putting them in their new homes
  		#drop = FALSE keeps a matrix even when there is only one row
  		NuRDATN[,j] <- rowMeans(RAWDATNUM[,cols,drop = FALSE],na.rm = TRUE)
  	}
  }
  
  
  #Scaling the Data
  #Column-wise centring and scaling; the bookkeeping attributes that scale()
  #attaches are removed again
  scrawdat <- scale(NuRDATN)
  attr(scrawdat,"scaled:center") <- NULL
  attr(scrawdat,"scaled:scale") <- NULL
  colnames(scrawdat) <- rownames(tabRDATID)
  
  
  #Discretized the Data
  #Discretize, then flip so that each ID is a row and each original column
  #of the numeric table is a column again
  dscrt <- dndat(scrawdat)
  dialzdat <- as.data.frame(t(dscrt))
  colnames(dialzdat) <- rownames(RAWDATNUM)
  
  #gene names
  #The row names (the IDs) become an ordinary first column called "ID_REF"
  genena <- as.data.frame(as.matrix(rownames(dialzdat)))
  colnames(genena) <- "ID_REF"
  rownames(dialzdat) <- NULL
  dialzdat <- bind_cols(genena,dialzdat)
  
  #NAs in a column
  #One extra row labelled "COL_NAs" holding the NA count of every data column
  ncoldat <- dim(dialzdat)[2]
  nacol <- as.data.frame(t(rep(0,ncoldat)),stringsAsFactors = FALSE)
  nacol[1,1] <- "COL_NAs"
  if(ncoldat > 1){
  	nacol[1,-1] <- colSums(is.na(dialzdat[-1]))
  }
  colnames(nacol) <- colnames(dialzdat)
  dialzdat <- bind_rows(dialzdat,nacol)
  
  #NAs in a row
  #Counted after the COL_NAs row was added, so that row gets a count too
  narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
  dialzdat <- bind_cols(dialzdat,narowd)
  
  #converting to character so that the clinical can be brought together with discrete data
  #Every column except the last one (ROW_NAs) is converted. This is the same
  #range the former `2:dim(dialzdat)[2]-1` produced, because `:` binds
  #tighter than `-` and that expression is (2:n) - 1, i.e. 1:(n - 1).
  charcols <- seq_len(dim(dialzdat)[2] - 1)
  dialzdat[charcols] <- lapply(dialzdat[charcols], as.character)
8bfefd7af   Efrain Gonzalez   Update
145
  #The End the full data
  #Clinical rows first, then the discretized rows
  Fullalzdw <- bind_rows(RAWWORD,dialzdat)
  
  #Create the file
  #Output name is "GSE" + the digits of the chosen file's name + "dscrt.txt".
  #basename() isolates the file name using the platform's path rules, so
  #digits in directory names cannot leak into the output name.
  nfnaex <- paste0("GSE", gsub("\\D","",basename(rawdat)), "dscrt.txt")
  #Written to the current working directory
  write.table(Fullalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)