Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 8bfefd7afece42806a3e35e41fabe6e2d817255b

Authored by Efrain Gonzalez 2017-06-12 14:27:42 -0400

1 parent 788834dd79

Exists in master

Update

Showing 1 changed file with 1 addition and 1 deletion Show diff stats

RPostClean.R

Diff comments View file @ 8bfefd7

1	#For Reading Raw Data from the created file	1	#For Reading Raw Data from the created file
2		2
3	#Required Libraries	3	#Required Libraries
4	library(MASS)	4	library(MASS)
5	library(dplyr)	5	library(dplyr)
6	library(tidyr)	6	library(tidyr)
7	library(readr)	7	library(readr)
8	library(stringr)	8	library(stringr)
9		9
10		10
11	#Necessary Functions	11	#Necessary Functions
12		12
#1# Function for discretizing the data
# Maps every cell of NDATA (a numeric matrix) onto three levels:
#   0 when the value is below -1,
#   1 when the value lies in [-1, 1],
#   2 when the value is above 1.
# Missing cells (NA / NaN) are copied through unchanged.
# Returns a numeric matrix with the same dimensions and column names as
# NDATA; row names are not carried over.
dndat <- function(NDATA) {
  DDATA <- matrix(0, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
  colnames(DDATA) <- colnames(NDATA)

  # Missing cells keep whatever missing marker the input had.
  missing <- is.na(NDATA)
  DDATA[missing] <- NDATA[missing]

  # Cells below -1 keep the initial 0.
  # The middle band is closed at both ends: a value of exactly 1 used to match
  # no branch at all and was silently left at 0, i.e. reported as "low".
  DDATA[!missing & NDATA >= -1 & NDATA <= 1] <- 1
  DDATA[!missing & NDATA > 1] <- 2

  DDATA
}
42		42
43		43
#Bringing in the file
# Ask the user for the tab-delimited raw file. The chosen path is kept in
# `rawdat` because it is needed again when the output file name is built.
rawdat <- file.choose()

# Rows whose first column matches one of these labels are clinical
# annotation rows; everything else is the header row plus the data rows.
# Alternation is a plain "|": "\|" is not a valid escape in an R string.
clinpat <- "Group|Age|Region|PMI|Title|Sex|Braak"

# Read the file once (the first line is skipped) and split it by row type.
rawall <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
isclin <- grepl(clinpat, rawall$X1)

RAWDAT <- filter(rawall, !isclin)
if (nrow(RAWDAT) == 0) {
  stop("No header/data rows found in ", rawdat, call. = FALSE)
}
# The first remaining row holds the column headers; use it as the names.
# It is left in place here and is dropped later via RAWDAT[-1, ].
names(RAWDAT) <- vapply(RAWDAT[1, ], as.character, character(1),
                        USE.NAMES = FALSE)

#Just the clinical data
RAWWORD <- filter(rawall, isclin)
names(RAWWORD) <- names(RAWDAT)

#Add col of NAs to clinical data
# One count of missing cells per clinical row, stored as a double column.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
65		65
66		66
##Getting back to the data
# Drop the header row (already used for the column names) and sort the
# remaining rows by ID_REF.
RAWDAT2 <- RAWDAT[-1, ] %>%
  dplyr::arrange(ID_REF)

##Editing the file for R processing
# First column as a one-column matrix of identifiers.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Every other column converted to numeric and stacked as rows, so the result
# has one row per original column (row names = column names) and one column
# per row of RAWDAT2. rbind() keeps this orientation even when RAWDAT2 has a
# single row, where mapply()'s simplification would collapse to a vector.
RAWDATNUM <- do.call(rbind, lapply(RAWDAT2[, -1], as.numeric))
77		77
##Consolidating genes with the same name
# tabRDATID counts how often each identifier occurs. NuRDATN gets one column
# per distinct identifier, in the order of the table's names.
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
idnames <- rownames(tabRDATID)
for (g in seq_along(idnames)) {
  hits <- which(RAWDATID == idnames[g])
  if (length(hits) == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, g] <- RAWDATNUM[, hits]
  } else if (length(hits) > 1) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has only one row,
    # which rowMeans() requires.
    NuRDATN[, g] <- rowMeans(RAWDATNUM[, hits, drop = FALSE], na.rm = TRUE)
  }
}
93		93
94		94
#Scaling the Data
# Centre and scale each column of the consolidated matrix with scale()'s
# defaults.
scrawdat <- scale(NuRDATN)
# scale() records the centring and scaling vectors as attributes; remove both
# so that only the plain matrix is carried forward.
attr(scrawdat, "scaled:scale") <- NULL
attr(scrawdat, "scaled:center") <- NULL
# Label the columns with the identifiers taken from the frequency table.
colnames(scrawdat) <- rownames(tabRDATID)
101		101
102		102
#Discretized the Data
# Discretize the scaled matrix with dndat(), transpose it so that the
# columns of scrawdat become rows, and turn the result into a data frame.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# Built as an explicit character column so its type does not depend on the
# R version's stringsAsFactors default.
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
116		116
#NAs in a column
# One extra row labelled "COL_NAs" holding, for every column after the
# first, the number of missing cells in that column. The placeholder names
# are replaced with dialzdat's names before the row is appended.
nacounts <- unname(colSums(is.na(dialzdat[, -1, drop = FALSE])))
nacol <- c(list("COL_NAs"), as.list(nacounts))
names(nacol) <- paste0("V", seq_along(nacol))
nacol <- as.data.frame(nacol, stringsAsFactors = FALSE)
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

#NAs in a row
# One extra column with the number of missing cells in each row; this
# includes the COL_NAs row appended above.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
137		137
#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one is converted to character.
# The old bound `2:dim(dialzdat)[2]-1` parsed as `(2:n) - 1`, i.e. 1:(n - 1);
# that range is spelled out here so the first column is visibly included and
# the last column is visibly left alone.
charcols <- seq_len(ncol(dialzdat) - 1)
dialzdat[charcols] <- lapply(dialzdat[charcols], as.character)
144		144
145		145
#The End the full data
# Clinical rows first, then the discretized rows.
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Take the last component of the chosen path (split on "\", "|" or "/"),
# keep only its digits and wrap them as "GSE<digits>dscrt.txt".
# The character class is written "[\\|/]": "\|" is not a valid escape in an
# R string literal.
pathparts <- strsplit(rawdat, "[\\|/]")[[1]]
nfnaex <- paste0("GSE", gsub("\\D", "", pathparts[length(pathparts)]), "dscrt.txt")
# The name carries no directory, so the file is written relative to the
# current working directory.
write.table(Fullalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)
157		157
158		158