Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit eccb7a19e29c5a6300ce75a7154eac8089de2a0b

Authored by Efrain Gonzalez 2017-06-21 13:59:44 -0400

2 parents 01a09e19c4 edf2baf695

Exists in master

Merge branch 'master' of smlg.fiu.edu:efraingonzalez0/cleaning-and-fixing-data-with-r

Showing 2 changed files Show diff stats

RAutoClDs.R

Diff comments View file @ eccb7a1

1	########################################################################	1	########################################################################
2	# Don't Use This Code Just Yet #	2	# Don't Use This Code Just Yet #
3	########################################################################	3	########################################################################
4	#Efrain H. Gonzalez	4	#Efrain H. Gonzalez
5	#6/16/2017	5	#6/21/2017
6		6	options(digits = 11)
7	#Libraries required to run the code	7	#Libraries required to run the code
8	library(pryr)	8	library(pryr)
9	library(MASS)	9	library(MASS)
10	library(dplyr)	10	library(dplyr)
11	library(tidyr)	11	library(tidyr)
12	library(readr)	12	library(readr)
13	library(stringr)	13	library(stringr)
14		14
15		15
16	#Necessary Functions	16	#Necessary Functions
17	#1#Function for handling the changing of row names and column names	17	#1#Function for handling the changing of row names and column names
18	chngrownm <- function(mat){	18	chngrownm <- function(mat){
19	row <- dim(mat)[1]	19	row <- dim(mat)[1]
20	col <- dim(mat)[2]	20	col <- dim(mat)[2]
21	e <- 1	21	e <- 1
22	r <- 1	22	r <- 1
23	a <- 1	23	a <- 1
24	h <- 1	24	h <- 1
25	g <- 1	25	g <- 1
26	o <- 1	26	o <- 1
27	for(e in 1:col){	27	for(e in 1:col){
28	if("!Sample_source_name_ch1"==mat[1,e]){	28	if("!Sample_source_name_ch1"==mat[1,e]){
29	colnames(mat)[e] <- "Brain_Region"	29	colnames(mat)[e] <- "Brain_Region"
30	}	30	} else if("!Sample_title" == mat[1,e]){
31	else if("!Sample_title" == mat[1,e]){
32	colnames(mat)[e] <- "Title"	31	colnames(mat)[e] <- "Title"
33	}	32	} else if("!Sample_geo_accession" == mat[1,e]){
34	else if("!Sample_geo_accession" == mat[1,e]){
35	colnames(mat)[e] <- "ID_REF"	33	colnames(mat)[e] <- "ID_REF"
36	} else{	34	} else{
37	if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){	35	if(grepl("Sex|gender|Gender|sex",mat[2,e])==TRUE){
38	colnames(mat)[e] <- paste0("Sex",r)	36	colnames(mat)[e] <- paste0("Sex",r)
39	r = r + 1	37	r = r + 1
40	}	38	}
41	else if(grepl("postmorteminterval|PMI|pmi",mat[2,e])==TRUE){	39	if(grepl("postmorteminterval|PMI|pmi|interval",mat[2,e])==TRUE){
42	colnames(mat)[e] <- paste0("PMI",a)	40	colnames(mat)[e] <- paste0("PMI",a)
43	a = a + 1	41	a = a + 1
44	}	42	}
45	else if(grepl("age|Age|AGE",mat[2,e])==TRUE){	43	if(grepl("age|Age|AGE",mat[2,e])==TRUE){
46	colnames(mat)[e] <- paste0("Age",h)	44	colnames(mat)[e] <- paste0("Age",h)
47	h = h + 1	45	h = h + 1
48	}	46	}
49	else if(grepl("braak|b&b",mat[2,e])==TRUE){	47	if(grepl("braak|b&b",mat[2,e])==TRUE){
50	colnames(mat)[e] <- paste0("Braak",g)	48	colnames(mat)[e] <- paste0("Braak",g)
51	g = g + 1	49	g = g + 1
52	}	50	}
53	else if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){	51	if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,e])==TRUE){
54	colnames(mat)[e] <- paste0("Group",o)	52	colnames(mat)[e] <- paste0("Group",o)
55	o = o + 1	53	o = o + 1
56	}	54	}
57		55
58	}	56	}
59	e = e + 1	57	e = e + 1
60	}	58	}
61	mat	59	mat
62	}	60	}
63		61
64	#2#Function for reorganizing information within the columns	62	#2#Function for reorganizing information within the columns
65	cinfo <- function(mat){	63	cinfo <- function(mat){
66	col <- dim(mat)[2]	64	col <- dim(mat)[2]
67	j <-2	65	j <-2
68	for(j in 2:col){	66	for(j in 2:col){
69	if(grepl("Group",colnames(mat)[j]) == TRUE){	67	if(grepl("Group",colnames(mat)[j]) == TRUE){
70	mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])	68	mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
71	}	69	} else if(grepl("Age",colnames(mat)[j])==TRUE){
72	else if(grepl("Age",colnames(mat)[j])==TRUE){
73	mat[,j] <- gsub("\\D","",mat[,j])%>%	70	mat[,j] <- gsub("\\D","",mat[,j])%>%
74	as.integer()	71	as.integer()
75	}	72	} else if(grepl("Sex",colnames(mat)[j])==TRUE){
76	else if(grepl("Sex",colnames(mat)[j])==TRUE){
77	mat[,j] <- gsub(".+:\\s","",mat[,j])	73	mat[,j] <- gsub(".+:\\s","",mat[,j])
78	}	74	} else if(grepl("PMI",colnames(mat)[j])==TRUE){
79	else if(grepl("PMI",colnames(mat)[j])==TRUE){
80	mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%	75	mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
81	as.numeric()	76	as.numeric()
82	}	77	} else if(grepl("Braak",colnames(mat)[j])==TRUE){
83	else if(grepl("Braak",colnames(mat)[j])==TRUE){
84	mat[,j]<-gsub(".+:\\s","",mat[,j])%>%	78	mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
85	as.roman()%>%	79	as.roman()%>%
86	as.integer()	80	as.integer()
87	}	81	}
88	j=j+1	82	j=j+1
89	}	83	}
90	mat	84	mat
91	}	85	}
92		86
93	#3#Function for labeling the gene IDs without names	87	#3#Function for labeling the gene IDs without names
94	NAFIXING <- function(GIDNAM){	88	NAFIXING <- function(GIDNAM){
95	row <- dim(GIDNAM)[1]	89	row <- dim(GIDNAM)[1]
96	i <- 1	90	i <- 1
97	for(i in 1:row){	91	for(i in 1:row){
98	if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){	92	if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){
99	GIDNAM[i,2] <- GIDNAM[i,1]	93	GIDNAM[i,2] <- GIDNAM[i,1]
100	}	94	}
101	i <- i + 1	95	i <- i + 1
102	}	96	}
103	GIDNAM	97	GIDNAM
104	}	98	}
105		99
106	#4#Function for changing the gene ID to gene name	100	#4#Function for changing the gene ID to gene name
107	cgeneID <- function(GeneName,DATA){	101	cgeneID <- function(GeneName,DATA){
108	colGene <- dim(GeneName)[2]	102	nj <- t(GeneName)
109	j <- 1	103	nq <- t(DATA)
110	for(j in 1:colGene){	104	colGene <- dim(nj)[2]
111	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])	105	colDATA <- dim(nq)[2]
112	if(is.na(sum(chngsreq))==FALSE){	106	j <- 1
113	if(sum(chngsreq) > 0){	107	for(j in 1:colDATA){
114	DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])	108	#where is that gene id located within the GPL file
		109	chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
		110	if(is.na(sum(chngreq))==FALSE){
		111	if(sum(chngreq) > 0){
		112	nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
		113	}
115	}	114	}
		115	j <- j + 1
116	}	116	}
117	j = j+1	117	nq
118	}
119	DATA
120	}	118	}
		119	#cgeneID <- function(GeneName,DATA){
		120	# colGene <- dim(GeneName)[2]
		121	# j <- 1
		122	# for(j in 1:colGene){
		123	# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
		124	# if(is.na(sum(chngsreq))==FALSE){
		125	# if(sum(chngsreq) > 0){
		126	# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
		127	# }
		128	# }
		129	# j = j+1
		130	# }
		131	# DATA
		132	#}
121		133
122	#5#Function for adjusting the gene names	134	#5#Function for adjusting the gene names
123	gcnames <- function(DiData,usecol=1){	135	gcnames <- function(DiData,usecol=1){
124	nuruns <- dim(DiData)[2]	136	nuruns <- dim(DiData)[2]
125	i = 1	137	i = 1
126	nwnam <- rep("0",length.out=nuruns)	138	nwnam <- rep("0",length.out=nuruns)
127	for(i in 1:nuruns){	139	for(i in 1:nuruns){
128	if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){	140	if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
129	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])	141	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
130	} else{	142	} else{
131	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])	143	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
132	}	144	}
133		145
134	}	146	}
135	nwnam	147	nwnam
136		148
137	}	149	}
138		150
139	#6# Function for discretizing the data	151	#6# Function for discretizing the data
140	dndat <- function(NDATA){	152	dndat <- function(NDATA){
141	rownd <- dim(NDATA)[1]	153	rownd <- dim(NDATA)[1]
142	colnd <- dim(NDATA)[2]	154	colnd <- dim(NDATA)[2]
143	DDATA <- matrix(0,nrow=rownd,ncol=colnd)	155	DDATA <- matrix(0,nrow=rownd,ncol=colnd)
144	colnames(DDATA) <- colnames(NDATA)	156	colnames(DDATA) <- colnames(NDATA)
145	i <- 1	157	i <- 1
146	for(i in 1:rownd){	158	for(i in 1:rownd){
147	j <- 1	159	j <- 1
148	for(j in 1:colnd){	160	for(j in 1:colnd){
149	if(is.na(NDATA[i,j])==FALSE){	161	if(is.na(NDATA[i,j])==FALSE){
150		162
151	if(NDATA[i,j] < -1){	163	if(NDATA[i,j] < -1){
152	DDATA[i,j]=0L	164	DDATA[i,j]=0L
153	}	165	} else if(NDATA[i,j] > 1){
154	if(NDATA[i,j] > 1){
155	DDATA[i,j]=2L	166	DDATA[i,j]=2L
156	}	167	} else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
157	if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
158	DDATA[i,j]=1L	168	DDATA[i,j]=1L
159	}	169	}
160	} else{	170	} else{
161	DDATA[i,j] = NDATA[i,j]	171	DDATA[i,j] = NDATA[i,j]
162	}	172	}
163	j = j + 1	173	j = j + 1
164	}	174	}
165	i = i + 1	175	i = i + 1
166	}	176	}
167	DDATA	177	DDATA
168	}	178	}
169		179
170		180
171	#MajorFunction#This is the function that does everything else	181	#MajorFunction#This is the function that does everything else
172	THEFT <- function(){	182	THEFT <- function(){
173	#Set working directory based on the directory of the series matrix file Currently only works for windows	183	#Set working directory based on the directory of the series matrix file Currently only works for windows
174	wd <- getwd()	184	wd <- getwd()
175	#list.files()	185	#list.files()
176	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")	186	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
177	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)	187	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
178	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())	188	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
179		189	GSEfloc <- list.files()[GSEfileloc]
180	#ALL DATA FILES WILL BE CLEANED	190	#ALL DATA FILES WILL BE CLEANED
181	if(numDAT == 1){	191	if(numDAT == 1){
182	#indexing the data files	192	#indexing the data files
183	n <- 1	193	n <- 1
184	for(n in 1: length(GSEfileloc)){	194	for(n in 1: length(GSEfloc)){
185	alz <- list.files()[GSEfileloc[n]]	195	alz <- GSEfloc[n]
186		196
187	#Working with the wordy part of the document	197	#Working with the wordy part of the document
188	alzword <- alz %>%	198	alzword <- alz %>%
189	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%	199	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
190	filter(grepl("!Sample",X1))%>%	200	filter(grepl("!Sample",X1))%>%
191	filter(!grepl("!Sample_contact",X1))	201	filter(!grepl("!Sample_contact",X1))
192		202
193	#Getting the GPL file	203	#Getting the GPL file
194	genena <- grep("_platform_id",alzword$X1) %>%	204	genena <- grep("_platform_id",alzword$X1) %>%
195	alzword$X2[.] %>%	205	alzword$X2[.] %>%
196	str_trim(.) %>%	206	str_trim(.) %>%
197	paste0("^",.,"\\D") %>%	207	paste0("^",.,"\\D") %>%
198	grep(.,list.files()) %>%	208	grep(.,list.files()) %>%
199	list.files()[.]	209	list.files()[.]
200		210
201	#Find out if it is a soft GPL file or not	211	#Find out if it is a soft GPL file or not
202	soft <- strsplit(genena,"[\\|/]") %>%	212	soft <- strsplit(genena,"[\\|/]") %>%
203	.[[1]] %>%	213	.[[1]] %>%
204	.[length(.)] %>%	214	.[length(.)] %>%
205	grepl("soft",.)	215	grepl("soft",.)
206		216
207	##Changing row names and column names:	217	##Changing row names and column names:
208	ALZWORD <- t(alzword)	218	ALZWORD <- t(alzword)
209	rownames(ALZWORD)=NULL	219	rownames(ALZWORD)=NULL
210	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)	220	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
211	ALZWORD <- chngrownm(ALZWORD)[-1,]	221	ALZWORD <- chngrownm(ALZWORD)[-1,]
212	ALZWORD <- ALZWORD%>%	222	ALZWORD <- ALZWORD%>%
213	as.data.frame()%>%	223	as.data.frame()%>%
214	dplyr::select(-starts_with("col"))	224	dplyr::select(-starts_with("col"))
215		225
216	##Reorganizing information within the columns and final clinical data	226	##Reorganizing information within the columns and final clinical data
217	ALZWORDF <- cinfo(ALZWORD)	227	ALZWORDF <- cinfo(ALZWORD)
218		228
219		229
220	#Working with Actual Data part of file	230	#Working with Actual Data part of file
221	alzdat <- alz %>%	231	alzdat <- alz %>%
222	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)	232	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
223	ALZDAT <- t(alzdat[,-1])	233	ALZDAT <- t(alzdat[,-1])
224	rownames(ALZDAT)=NULL	234	rownames(ALZDAT)=NULL
225		235
226	##Is there a clean version of the GPL file available?	236	##Is there a clean version of the GPL file available?
227	gplnum <- strsplit(genena,"[\\|/]") %>%	237	gplnum <- strsplit(genena,"[\\|/]") %>%
228	.[[1]] %>%	238	.[[1]] %>%
229	.[length(.)] %>%	239	.[length(.)] %>%
230	gsub("\\D","",.)	240	gsub("\\D","",.)
231	clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))	241	clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
232	if(clfileex >= 1){	242	if(clfileex >= 1){
233	#use the clean version	243	#use the clean version
234	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%	244	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
235	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")	245	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
236		246
237	}	247	} else if(clfileex == 0){
238	else if(clfileex == 0){
239	##Lets Create a clean version	248	##Lets Create a clean version
240		249
241	##Gene ID to Gene Name	250	##Gene ID to Gene Name
242	if(soft == TRUE){	251	if(soft == TRUE){
243	#Check to see if there is already a file containing information on soft files	252	#Check to see if there is already a file containing information on soft files
244	fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))	253	fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
245	if(fileex == 1){	254	if(fileex == 1){
246	#Check to see if this GPL soft file has been used before	255	#Check to see if this GPL soft file has been used before
247	IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%	256	IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
248	.$GPL_FILE_NUM%>%	257	.$GPL_FILE_NUM%>%
249	grepl(gplnum,.) %>%	258	grepl(gplnum,.) %>%
250	sum()	259	sum()
251	if(IDF == 1){	260	if(IDF == 1){
252	IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%	261	IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
253	.$GPL_FILE_NUM%>%	262	.$GPL_FILE_NUM%>%
254	grep(gplnum,.)	263	grep(gplnum,.)
255	idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%	264	idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
256	.$LOC_ID %>%	265	.$LOC_ID %>%
257	.[IDLOCAL]	266	.[IDLOCAL]
258	geneIDNam <- genena %>%	267	geneIDNam <- genena %>%
259	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%	268	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
260	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))	269	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))
261	}	270	} else if(IDF == 0){
262	else if(IDF == 0){
263	#No information on this particular GPL file	271	#No information on this particular GPL file
264	idLOCGPL <- genena %>%	272	idLOCGPL <- genena %>%
265	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%	273	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
266	t(.) %>%	274	t(.) %>%
267	grep("^ID\\s*$",.) %>%	275	grep("^ID\\s*$",.) %>%
268	-1	276	-1
269	cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%	277	cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
270	cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)	278	cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
271	geneIDNam <- genena %>%	279	geneIDNam <- genena %>%
272	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%	280	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
273	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))	281	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))
274	}	282	}
275	}	283	} else if(fileex == 0){
276	else if(fileex == 0){
277	#We must create a file that we can access for later use	284	#We must create a file that we can access for later use
278	idLOCGPL <- genena %>%	285	idLOCGPL <- genena %>%
279	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%	286	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
280	t(.) %>%	287	t(.) %>%
281	grep("^ID\\s*$",.) %>%	288	grep("^ID\\s*$",.) %>%
282	-1	289	-1
283	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))	290	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
284	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")	291	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
285	write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)	292	write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
286	geneIDNam <- genena %>%	293	geneIDNam <- genena %>%
287	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%	294	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
288	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))	295	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))
289	}	296	}
290	}	297	} else if(soft == FALSE){
291	else if(soft == FALSE){
292	geneIDNam <- genena %>%	298	geneIDNam <- genena %>%
293	read_delim(delim="\t",comment = "#")%>%	299	read_delim(delim="\t",comment = "#")%>%
294	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))	300	dplyr::select(.,ID,grep("Symbol|^ORF\\s$|^gene_assignment\\s$",colnames(.)))
295	}	301	}
296		302
297	##Labeling the gene IDs without names	303	##Labeling the gene IDs without names
298	geneIDNam <- NAFIXING(geneIDNam)	304	geneIDNam <- NAFIXING(geneIDNam)
299		305
300	##remove the whitespace	306	##remove the whitespace
301	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))	307	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
302		308
303	##Here is the clean version	309	##Here is the clean version
304	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)	310	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
305	}	311	}
306		312
307		313
308		314
309	##Changing the gene ID to gene name	315	##Changing the gene ID to gene name
310	ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))	316	ALZDAT1 <- cgeneID(geneIDNam,alzdat)
311	colnames(ALZDAT) = ALZDAT1[1,]	317	colnames(ALZDAT) = ALZDAT1[1,]
312		318
313		319
314	##Adjusting the column names aka the gene names	320	##Adjusting the column names aka the gene names
315	colnames(ALZDAT) <- gcnames(ALZDAT)	321	colnames(ALZDAT) <- gcnames(ALZDAT)
316		322
317		323
318	#Full RAW Data	324	#Full RAW Data
319	Fullalzdwr <- ALZDAT %>%	325	Fullalzdwr <- ALZDAT %>%
320	as.data.frame() %>%	326	as.data.frame() %>%
321	cbind(ALZWORDF,.)	327	cbind(ALZWORDF,.)
322		328
323	#Raw file is output	329	#Raw file is output
324	nfnaex <- strsplit(alz,"[\\]") %>%	330	nfnaex <- strsplit(alz,"[\\]") %>%
325	.[[1]] %>%	331	.[[1]] %>%
326	.[length(.)] %>%	332	.[length(.)] %>%
327	gsub("\\D","",.) %>%	333	gsub("\\D","",.) %>%
328	c("GSE",.,"aftexcel.txt") %>%	334	c("GSE",.,"aftexcel.txt") %>%
329	paste(collapse = "")	335	paste(collapse = "")
330	write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")	336	write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
331		337
332		338
333		339
334	#Now for the discretization part	340	#Now for the discretization part
335	##get the wordy part again	341	##get the wordy part again
336	rawword <- t(ALZWORDF)	342	rawword <- t(ALZWORDF)
337		343
338	##where is ID_REF located	344	##where is ID_REF located
339	hereim <- grep("ID_REF",rownames(rawword))	345	hereim <- grep("ID_REF",rownames(rawword))
340		346
341	##Subject Names GSM...	347	##Subject Names GSM...
342	subjnam <- rawword[hereim,]	348	subjnam <- rawword[hereim,]
343		349
344	##Getting the names for the rows	350	##Getting the names for the rows
345	namedarows <- rownames(rawword)[-hereim] %>%	351	namedarows <- rownames(rawword)[-hereim] %>%
346	as.data.frame()	352	as.data.frame()
347	RAWWORD <- rawword[-hereim,] %>%	353	RAWWORD <- rawword[-hereim,] %>%
348	as.data.frame() %>%	354	as.data.frame() %>%
349	bind_cols(namedarows,.)	355	bind_cols(namedarows,.)
350	z <- 1	356	z <- 1
351	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)	357	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
352	for(z in 1:dim(RAWWORD)[1]){	358	for(z in 1:dim(RAWWORD)[1]){
353	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))	359	if(sum(is.na(RAWWORD[z,])) > 0){
354	z <- z + 1	360	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
355	}	361	}
		362	if(length(grep("NA",RAWWORD[z,])) > 0){
		363	naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
		364	}
		365	z <- z + 1
		366	}
356		367
357	colnames(naroww) <- "ROW_NAs"	368	colnames(naroww) <- "ROW_NAs"
358	RAWWORD <- bind_cols(RAWWORD,naroww)	369	RAWWORD <- bind_cols(RAWWORD,naroww)
359		370
360		371
361	roALZna <- t(ALZDAT) %>%	372	roALZna <- t(ALZDAT) %>%
362	rownames(.) %>%	373	rownames(.) %>%
363	as.data.frame(.)	374	as.data.frame(.)
364	colnames(roALZna) <- "ID_REF"	375	colnames(roALZna) <- "ID_REF"
365		376
366	RAWDAT <- t(ALZDAT) %>%	377	RAWDAT <- t(ALZDAT) %>%
367	as.data.frame(.)	378	as.data.frame(.)
368	colnames(RAWDAT) <- NULL	379	colnames(RAWDAT) <- NULL
369	rownames(RAWDAT) <- NULL	380	rownames(RAWDAT) <- NULL
370		381
371	RAWDAT2 <- RAWDAT %>%	382	RAWDAT2 <- RAWDAT %>%
372	cbind(roALZna,.) %>%	383	cbind(roALZna,.) %>%
373	dplyr::arrange(.,ID_REF)	384	dplyr::arrange(.,ID_REF)
374		385
375	##Editing the file for R processing	386	##Editing the file for R processing
376	RAWDATID <- RAWDAT2[,1] %>%	387	RAWDATID <- RAWDAT2[,1] %>%
377	as.matrix(.)	388	as.matrix(.)
378		389
379	RAWDATNUM <- RAWDAT2[,-1] %>%	390	RAWDATNUM <- RAWDAT2[,-1] %>%
380	mapply(.,FUN = as.numeric) %>%	391	mapply(.,FUN = as.numeric) %>%
381	t(.)	392	t(.)
382		393
383	##Consolidating genes with the same name	394	##Consolidating genes with the same name
384	###create empty matrix of size equal to tabRDATID	395	###create empty matrix of size equal to tabRDATID
385	tabRDATID <- table(RAWDATID)	396	tabRDATID <- table(RAWDATID)
386	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))	397	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
387	j <- 1	398	j <- 1
388	for(j in 1:length(tabRDATID)){	399	for(j in 1:length(tabRDATID)){
389	##Putting the ones without duplicates in their new homes	400	##Putting the ones without duplicates in their new homes
390	if(tabRDATID[j] == 1){	401	if(tabRDATID[j] == 1){
391	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]	402	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
392	}	403	} else if(tabRDATID[j] > 1){
393	##Averaging duplicates and putting them in their new homes	404	##Averaging duplicates and putting them in their new homes
394	else if(tabRDATID[j] > 1){
395	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)	405	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
396	}	406	}
397	j <- j + 1	407	j <- j + 1
398	}	408	}
399		409
400	##Scaling the Data	410	##Scaling the Data
401	scrawdat <- NuRDATN%>%	411	scrawdat <- NuRDATN%>%
402	scale()	412	scale()
403	attr(scrawdat,"scaled:center") <- NULL	413	attr(scrawdat,"scaled:center") <- NULL
404	attr(scrawdat,"scaled:scale") <- NULL	414	attr(scrawdat,"scaled:scale") <- NULL
405	colnames(scrawdat) <- rownames(tabRDATID)	415	colnames(scrawdat) <- rownames(tabRDATID)
406		416
407	##Discretized the Data	417	##Discretized the Data
408	dialzdat <- scrawdat %>%	418	dialzdat <- scrawdat %>%
409	dndat(.) %>%	419	dndat(.) %>%
410	t()%>%	420	t()%>%
411	as.data.frame(.)	421	as.data.frame(.)
412	colnames(dialzdat) <- rownames(RAWDATNUM)	422	colnames(dialzdat) <- rownames(RAWDATNUM)
413		423
414	##setting "ID_REF" as a new variable	424	##setting "ID_REF" as a new variable
415	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))	425	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
416	colnames(geneNAM) <- "ID_REF"	426	colnames(geneNAM) <- "ID_REF"
417	rownames(dialzdat) <- NULL	427	rownames(dialzdat) <- NULL
418	dialzdat <-bind_cols(geneNAM,dialzdat)	428	dialzdat <-bind_cols(geneNAM,dialzdat)
419		429
420	##NAs in a column	430	##NAs in a column
421	x <- 2	431	x <- 2
422	nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)	432	nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
423	nacol[1,1] = "COL_NAs"	433	nacol[1,1] = "COL_NAs"
424	for(x in 2:dim(dialzdat)[2]){	434	for(x in 2:dim(dialzdat)[2]){
425	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))	435	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
426	x <- x + 1	436	x <- x + 1
427	}	437	}
428	colnames(nacol) <- colnames(dialzdat)	438	colnames(nacol) <- colnames(dialzdat)
429	dialzdat <- bind_rows(dialzdat,nacol)	439	dialzdat <- bind_rows(dialzdat,nacol)
430		440
431	##NAs in a row	441	##NAs in a row
432	y <- 1	442	y <- 1
433	narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)	443	narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
434	for(y in 1:dim(dialzdat)[1]){	444	for(y in 1:dim(dialzdat)[1]){
435	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))	445	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
436	y <- y + 1	446	y <- y + 1
437	}	447	}
438	colnames(narowd) <- "ROW_NAs"	448	colnames(narowd) <- "ROW_NAs"
439	dialzdat <- bind_cols(dialzdat,narowd)	449	dialzdat <- bind_cols(dialzdat,narowd)
440	colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam	450	colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
441	colnames(RAWWORD) <- colnames(dialzdat)	451	colnames(RAWWORD) <- colnames(dialzdat)
442	##converting to character so that the clinical can be brought together with discrete data	452	##converting to character so that the clinical can be brought together with discrete data
443	k <- 2	453	k <- 2
444	for(k in 2:dim(dialzdat)[2]-1){	454	for(k in 2:dim(dialzdat)[2]-1){
445	dialzdat[,k] <- as.character(dialzdat[,k])	455	dialzdat[,k] <- as.character(dialzdat[,k])
446	k <- k + 1	456	k <- k + 1
447	}	457	}
448	#The End the full data	458	#The End the full data
449	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)	459	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
450		460
451	#Produces Discrete file	461	#Produces Discrete file
452	nfnaex2 <- strsplit(alz,"[\\|/]") %>%	462	nfnaex2 <- strsplit(alz,"[\\|/]") %>%
453	.[[1]] %>%	463	.[[1]] %>%
454	.[length(.)] %>%	464	.[length(.)] %>%
455	gsub("\\D","",.) %>%	465	gsub("\\D","",.) %>%
456	c("GSE",.,"dscrt.txt") %>%	466	c("GSE",.,"dscrt.txt") %>%
457	paste(collapse = "")	467	paste(collapse = "")
458	write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)	468	write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
459	n <- n +1	469	n <- n +1
460	}	470	}
461	}	471	} else if(numDAT == 2){
462
463	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN	472	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
464	else if(numDAT == 2){	473
465	#All the files you want to analyze	474	#All the files you want to analyze
466	ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")	475	ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
467	if(length(ANDIS) == 0){	476	if(length(ANDIS) == 0){
468	#Spit out a warning	477	#Spit out a warning
469	warning("You did not select any files and so no cleaning will be performed")	478	warning("You did not select any files and so no cleaning will be performed")
470	} else{	479	} else{
471	#indexing the data files	480	#indexing the data files
472	n <- 1	481	n <- 1
473	for(n in 1: length(ANDIS)){	482	for(n in 1: length(ANDIS)){
474	alz <- ANDIS[n]	483	alz <- ANDIS[n]
475		484
476	#Working with the wordy part of the document	485	#Working with the wordy part of the document
477	alzword <- alz %>%	486	alzword <- alz %>%
478	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%	487	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
479	filter(grepl("!Sample",X1))%>%	488	filter(grepl("!Sample",X1))%>%
480	filter(!grepl("!Sample_contact",X1))	489	filter(!grepl("!Sample_contact",X1))
481		490
482	#Getting the GPL file	491	#Getting the GPL file
483	genena <- grep("_platform_id",alzword$X1) %>%	492	genena <- grep("_platform_id",alzword$X1) %>%
484	alzword$X2[.] %>%	493	alzword$X2[.] %>%
485	str_trim(.) %>%	494	str_trim(.) %>%
486	paste0("^",.,"\\D") %>%	495	paste0("^",.,"\\D") %>%
487	grep(.,list.files()) %>%	496	grep(.,list.files()) %>%
488	list.files()[.]	497	list.files()[.]
489		498
490	#Find out if it is a soft GPL file or not	499	#Find out if it is a soft GPL file or not
491	soft <- strsplit(genena,"[\\|/]") %>%	500	soft <- strsplit(genena,"[\\|/]") %>%
492	.[[1]] %>%	501	.[[1]] %>%
493	.[length(.)] %>%	502	.[length(.)] %>%
494	grepl("soft",.)	503	grepl("soft",.)
495		504
496	##Changing row names and column names:	505	##Changing row names and column names:
497	ALZWORD <- t(alzword)	506	ALZWORD <- t(alzword)
498	rownames(ALZWORD)=NULL	507	rownames(ALZWORD)=NULL
499	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)	508	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
500	ALZWORD <- chngrownm(ALZWORD)[-1,]	509	ALZWORD <- chngrownm(ALZWORD)[-1,]
501	ALZWORD <- ALZWORD%>%	510	ALZWORD <- ALZWORD%>%
502	as.data.frame()%>%	511	as.data.frame()%>%
503	dplyr::select(-starts_with("col"))	512	dplyr::select(-starts_with("col"))
504		513
505	##Reorganizing information within the columns and final clinical data	514	##Reorganizing information within the columns and final clinical data
506	ALZWORDF <- cinfo(ALZWORD)	515	ALZWORDF <- cinfo(ALZWORD)
507		516
508		517
509	#Working with Actual Data part of file	518	#Working with Actual Data part of file
510	alzdat <- alz %>%	519	alzdat <- alz %>%
511	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)	520	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
512	ALZDAT <- t(alzdat[,-1])	521	ALZDAT <- t(alzdat[,-1])
513	rownames(ALZDAT)=NULL	522	rownames(ALZDAT)=NULL
514		523
515	##Is there a clean version of the GPL file available?	524	##Is there a clean version of the GPL file available?
516	gplnum <- strsplit(genena,"[\\|/]") %>%	525	gplnum <- strsplit(genena,"[\\|/]") %>%
517	.[[1]] %>%	526	.[[1]] %>%
518	.[length(.)] %>%	527	.[length(.)] %>%
519	gsub("\\D","",.)	528	gsub("\\D","",.)
520	clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))	529	clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
521	if(clfileex >= 1){	530	if(clfileex >= 1){
522	#use the clean version	531	#use the clean version
523	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%	532	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
524	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")	533	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
525		534
526	}	535	} else if(clfileex == 0){
527	else if(clfileex == 0){
528	##Lets Create a clean version	536	##Lets Create a clean version
529		537
530	##Gene ID to Gene Name	538	##Gene ID to Gene Name
531	if(soft == TRUE){	539	if(soft == TRUE){
532	#Check to see if there is already a file containing information on soft files	540	#Check to see if there is already a file containing information on soft files
533	fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))	541	fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
534	if(fileex == 1){	542	if(fileex == 1){
535	#Check to see if this GPL soft file has been used before	543	#Check to see if this GPL soft file has been used before
536	IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%	544	IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
537	.$GPL_FILE_NUM%>%	545	.$GPL_FILE_NUM%>%
538	grepl(gplnum,.) %>%	546	grepl(gplnum,.) %>%
539	sum()	547	sum()
540	if(IDF == 1){	548	if(IDF == 1){
541	IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%	549	IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
542	.$GPL_FILE_NUM%>%	550	.$GPL_FILE_NUM%>%
543	grep(gplnum,.)	551	grep(gplnum,.)
544	idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%	552	idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
545	.$LOC_ID %>%	553	.$LOC_ID %>%
546	.[IDLOCAL]	554	.[IDLOCAL]
547	geneIDNam <- genena %>%	555	geneIDNam <- genena %>%
548	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%	556	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
549	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))	557	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
550	}	558	} else if(IDF == 0){
551	else if(IDF == 0){
552	#No information on this particular GPL file	559	#No information on this particular GPL file
553	idLOCGPL <- genena %>%	560	idLOCGPL <- genena %>%
554	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%	561	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
555	t(.) %>%	562	t(.) %>%
556	grep("^ID\\s*$",.) %>%	563	grep("^ID\\s*$",.) %>%
557	-1	564	-1
558	cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%	565	cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
559	cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)	566	cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
560	geneIDNam <- genena %>%	567	geneIDNam <- genena %>%
561	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%	568	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
562	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))	569	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
563	}	570	}
564	}	571	} else if(fileex == 0){
565	else if(fileex == 0){
566	#We must create a file that we can access for later use	572	#We must create a file that we can access for later use
567	idLOCGPL <- genena %>%	573	idLOCGPL <- genena %>%
568	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%	574	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
569	t(.) %>%	575	t(.) %>%
570	grep("^ID\\s*$",.) %>%	576	grep("^ID\\s*$",.) %>%
571	-1	577	-1
572	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))	578	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
573	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")	579	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
574	write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)	580	write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
575	geneIDNam <- genena %>%	581	geneIDNam <- genena %>%
576	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%	582	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
577	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))	583	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
578	}	584	}
579	}	585	} else if(soft == FALSE){
580	else if(soft == FALSE){
581	geneIDNam <- genena %>%	586	geneIDNam <- genena %>%
582	read_delim(delim="\t",comment = "#")%>%	587	read_delim(delim="\t",comment = "#")%>%
583	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))	588	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
584	}	589	}
585		590
586	##Labeling the gene IDs without names	591	##Labeling the gene IDs without names
587	geneIDNam <- NAFIXING(geneIDNam)	592	geneIDNam <- NAFIXING(geneIDNam)
588		593
589	##remove the whitespace	594	##remove the whitespace
590	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))	595	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
591		596
592	##Here is the clean version	597	##Here is the clean version
593	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)	598	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
594	}	599	}
595		600
596		601
597		602
##Changing the gene ID to gene name
# Row 1 of cgeneID()'s result holds the translated identifiers.
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full RAW Data
Fullalzdwr <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

#Raw file is output
# Build "GSE<digits>aftexcel.txt" from the digits of the file name only.
# The path is split on "\", "|" and "/" exactly as for the GPL file; splitting
# on "\" alone let digits from directory names leak into the output name
# whenever the path uses forward slashes.
nfnaex <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "aftexcel.txt") %>%
  paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
620		625
621		626
622		627
#Now for the discretization part
##get the wordy part again
rawword <- t(ALZWORDF)

##where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))
if (length(hereim) == 0) {
  # An empty negative index selects no rows at all, so stop instead of
  # silently continuing with an empty table.
  stop("No ID_REF row found in the clinical data", call. = FALSE)
}

##Subject Names GSM...
subjnam <- rawword[hereim, ]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame()
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame() %>%
  bind_cols(namedarows, .)

##Missing values per clinical row
# A cell is missing when it is a real NA or when its text is exactly "NA"
# (surrounding blanks allowed). Each cell is counted once: adding is.na() and a
# substring search for "NA" counted real NAs twice and also hit text such as "RNA".
clinical_cells <- rawword[-hereim, , drop = FALSE]
na_like <- is.na(clinical_cells) | grepl("^\\s*NA\\s*$", clinical_cells)
naroww <- data.frame(ROW_NAs = unname(rowSums(na_like)))
RAWWORD <- bind_cols(RAWWORD, naroww)
648		658
649		659
# Identifiers: the row names of the transposed expression matrix.
roALZna <- as.data.frame(rownames(t(ALZDAT)))
colnames(roALZna) <- "ID_REF"

# Values: the same transposed matrix as a data frame, with all names removed.
RAWDAT <- as.data.frame(t(ALZDAT))
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

# Put the identifier column in front and sort the rows by it.
RAWDAT2 <- dplyr::arrange(cbind(roALZna, RAWDAT), ID_REF)

##Editing the file for R processing
# Identifier column as a one-column matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Every remaining column converted with as.numeric, then transposed.
RAWDATNUM <- t(mapply(RAWDAT2[, -1], FUN = as.numeric))
671		681
##Consolidating genes with the same name
###create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
# Group the column positions of RAWDATNUM by identifier in a single pass,
# instead of scanning every identifier again for each distinct name.
probe_groups <- split(
  seq_along(RAWDATID),
  factor(as.vector(RAWDATID), levels = rownames(tabRDATID))
)
for (j in seq_along(probe_groups)) {
  same_gene <- probe_groups[[j]]
  if (length(same_gene) == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, same_gene]
  } else if (length(same_gene) > 1) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_gene, drop = FALSE], na.rm = TRUE)
  }
}
688		697
##Scaling the Data
# scale() with its defaults; the centring/scaling attributes are then removed.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

##Discretized the Data
# dndat() output is transposed and stored as a data frame.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
# Move the row names into a leading ID_REF column.
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)
708		717
##NAs in a column
# One extra row: the label "COL_NAs" in the first column, followed by the
# number of NAs in each remaining column. Counted in one vectorized step, so
# no 2:n sequence can run backwards on a one-column table.
nacol <- as.data.frame(t(unname(colSums(is.na(dialzdat)))), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

##NAs in a row
# One extra column with the NA count of every row (the COL_NAs row included).
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
# Columns between the first and the last take the subject names.
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
731	##converting to character so that the clinical can be brought together with discrete data	740	##converting to character so that the clinical can be brought together with discrete data
732	k <- 2	741	k <- 2
733	for(k in 2:dim(dialzdat)[2]-1){	742	for(k in 2:dim(dialzdat)[2]-1){
734	dialzdat[,k] <- as.character(dialzdat[,k])	743	dialzdat[,k] <- as.character(dialzdat[,k])
735	k <- k + 1	744	k <- k + 1

RCleanDscret.R

Diff comments View file @ eccb7a1

1	##Posted 6/15/2017	1	##Posted 6/15/2017
2		2	options(digits = 11)
3		3
4	#Libraries required to run the code	4	#Libraries required to run the code
5	library(pryr)	5	library(pryr)
6	library(MASS)	6	library(MASS)
7	library(dplyr)	7	library(dplyr)
8	library(tidyr)	8	library(tidyr)
9	library(readr)	9	library(readr)
10	library(stringr)	10	library(stringr)
11		11
12		12
13	#Necessary Functions	13	#Necessary Functions
#1#Function for handling the changing of row names and column names
#
# Renames the columns of `mat` according to its first two rows.
#   mat: matrix that already has column names; row 1 is compared with the
#        "!Sample_*" field labels and row 2 is searched for keywords.
# Returns `mat` with only its column names changed.
#
# Row 1 decides first: "!Sample_source_name_ch1" -> "Brain_Region",
# "!Sample_title" -> "Title", "!Sample_geo_accession" -> "ID_REF".
# Any other column is named from the keywords found in row 2, numbered per
# category ("Sex1", "Age2", ...). Columns matching nothing keep their name.
chngrownm <- function(mat) {
  # Prefix and row-2 pattern of each category, tested in this order. When
  # several patterns match, the last one names the column, but each match
  # still uses up a number of its own category.
  keyword_rules <- list(
    c("Sex", "Sex|gender|Gender|sex"),
    c("PMI", "postmorteminterval|PMI|pmi"),
    c("Age", "age|Age|AGE"),
    c("Braak", "braak|b&b"),
    c("Group", "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal")
  )
  next_number <- rep(1, length(keyword_rules))

  # seq_len() keeps a zero-column matrix from being indexed at all.
  for (j in seq_len(ncol(mat))) {
    field_label <- mat[1, j]
    # isTRUE() treats an NA cell as "no match" instead of aborting the if().
    if (isTRUE(field_label == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    } else if (isTRUE(field_label == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    } else if (isTRUE(field_label == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      for (k in seq_along(keyword_rules)) {
        if (isTRUE(grepl(keyword_rules[[k]][2], mat[2, j]))) {
          colnames(mat)[j] <- paste0(keyword_rules[[k]][1], next_number[k])
          next_number[k] <- next_number[k] + 1
        }
      }
    }
  }
  mat
}
60		58
#2#Function for reorganizing information within the columns
#
# Cleans the columns of `mat`, chosen by column name; the first column is
# never touched.
#   Group*: text up to the last ": " is removed, as is a " ...;..." tail
#   Age*  : every non-digit is removed, result converted to integer
#   Sex*  : text up to the last ": " is removed
#   PMI*  : everything except digits and "." is removed, converted to numeric
#   Braak*: text up to the last ": " is removed, the rest read as a roman
#           numeral and converted to integer
# Returns `mat` with the matching columns rewritten.
cinfo <- function(mat) {
  field_names <- colnames(mat)
  # seq_len(...)[-1] is empty for a one-column table, whereas 2:1 would run
  # backwards and index a column that does not exist.
  for (j in seq_len(ncol(mat))[-1]) {
    field <- field_names[j]
    if (grepl("Group", field)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", field)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", field)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", field)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", field)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
89		87
#3#Function for labeling the gene IDs without names
#
# Fills in the second column wherever it has no usable value.
#   GIDNAM: data frame or matrix; column 1 holds the IDs, column 2 the names.
# A name counts as missing when it is NA or the text "NA" (trailing blanks
# allowed); it is then replaced by the ID of the same row.
# Returns GIDNAM with column 2 completed; a table without rows is returned
# unchanged.
NAFIXING <- function(GIDNAM) {
  is_table <- is.data.frame(GIDNAM)
  ids <- if (is_table) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_table) GIDNAM[[2]] else GIDNAM[, 2]

  # All rows are checked in one vectorized step rather than one by one.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  gene_names[unnamed] <- ids[unnamed]

  if (is_table) {
    GIDNAM[[2]] <- gene_names
  } else {
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}
102		100
#4#Function for changing the gene ID to gene name
#
# Translates the identifiers in the first column of DATA into gene names.
#   GeneName: two-column table; column 1 = gene ID, column 2 = gene name.
#   DATA    : table whose first column holds the gene IDs to translate.
# Returns t(DATA) with the IDs in its first row replaced by the matching gene
# name; IDs that are not listed in GeneName are left as they are.
cgeneID <- function(GeneName, DATA) {
  nj <- t(GeneName)
  nq <- t(DATA)
  if (ncol(nq) == 0 || ncol(nj) == 0) {
    return(nq)
  }
  # Exact lookup of every ID at once. IDs are compared as plain text, so
  # characters such as "." or "+" are not read as regular-expression syntax,
  # and an ID listed more than once resolves to its first entry.
  where_in_gpl <- match(nq[1, ], nj[1, ], incomparables = NA)
  known <- !is.na(where_in_gpl)
  nq[1, known] <- nj[2, where_in_gpl[known]]
  nq
}
		120	#cgeneID <- function(GeneName,DATA){
		121	# colGene <- dim(GeneName)[2]
		122	# j <- 1
		123	# for(j in 1:colGene){
		124	# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
		125	# if(is.na(sum(chngsreq))==FALSE){
		126	# if(sum(chngsreq) > 0){
		127	# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
		128	# }
		129	# }
		130	# #if(sum(chngsreq) > 0){
		131	# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
		132	# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
		133	# #}
		134	# j = j+1
		135	# }
		136	# DATA
		137	#}
122		138
#5#Function for adjusting the gene names
#
# Shortens column names that list several names separated by "///".
#   DiData: object with column names.
#   usecol: which of the "///"-separated names to keep (default: the first).
#           Names with fewer than `usecol` parts fall back to their first part.
# Returns a character vector with one trimmed name per column; an object
# without columns gives character(0).
gcnames <- function(DiData, usecol = 1) {
  full_names <- colnames(DiData)
  if (length(full_names) == 0) {
    return(character(0))
  }
  # Every name is split once; trimws() strips the blanks left around "///".
  vapply(
    strsplit(full_names, "///"),
    function(parts) {
      keep <- if (length(parts) >= usecol) usecol else 1
      trimws(parts[keep])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
139		155
#6# Function for discretizing the data
#
# Maps every value of a numeric table onto three levels.
#   NDATA: numeric matrix (or data frame).
# Returns a numeric matrix of the same size carrying the column names of NDATA:
#   0 for values below -1, 2 for values above 1, 1 for values from -1 to 1
#   (both ends included); missing values stay missing.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  # Start from the middle level so that a value of exactly 1 lands in it; it
  # previously matched no branch and kept the initial 0, i.e. the "low" level.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)
  # which() leaves out the NA comparisons, so missing cells are handled last.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2
  missing_cells <- which(is.na(values))
  DDATA[missing_cells] <- values[missing_cells]
  DDATA
}
170		184
171		185
172	#The Rest of this code will be used every time you want to change a data set	186	#The Rest of this code will be used every time you want to change a data set
173		187
#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Find out if it is a soft GPL file or not
# Only the last path component (split on "\", "|" or "/") is inspected; it is
# a soft file when that name contains "soft" or "annot".
soft <- grepl("soft|annot", tail(strsplit(genena, "[\\|/]")[[1]], 1))
188		202
#Working with the wordy part of the document
# Read the file without a header, treating "!Series" as the comment marker,
# then keep the "!Sample" rows except the "!Sample_contact" ones.
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(alzword, grepl("!Sample", X1))
alzword <- filter(alzword, !grepl("!Sample_contact", X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
# Make sure column names exist: missing ones are filled in as "col1", "col2", ...
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Rename the columns, then drop row 1 (the field labels).
ALZWORD <- chngrownm(ALZWORD)
ALZWORD <- ALZWORD[-1, ]
# Columns whose name still starts with "col" were not renamed and are removed.
ALZWORD <- dplyr::select(as.data.frame(ALZWORD), -starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)
206		220
207		221
#Working with Actual Data part of file
# Lines starting with "!" are treated as comments; one leading line is skipped.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Drop the first column and transpose the remaining columns.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL
213		227
##Is there a clean version of the GPL file available?
# gplnum is every digit found in the last path component of the GPL file.
gplnum <- strsplit(genena, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .)
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
# Test for the exact cache file. A pattern search over list.files() would also
# accept e.g. Clean_GPL570.txt when gplnum is "57" and then fail to read it.
clfileex <- as.integer(file.exists(clean_gpl_file))
if (clfileex >= 1) {
  #use the clean version
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version
  # Columns kept next to ID: anything matching this pattern.
  gene_name_cols <- "Symbol|^ORF\\s$|^gene_assignment\\s$"

  ##Gene ID to Gene Name
  if (soft == TRUE) {
    #Check to see if there is already a file containing information on soft files
    # GPL_ID_LOC.txt stores, per GPL number, the `skip` value used to read it.
    fileex <- as.integer(file.exists("GPL_ID_LOC.txt"))
    idLOCGPL <- integer(0)
    if (fileex == 1) {
      #Check to see if this GPL soft file has been used before
      # Read the index once and compare numbers exactly (no substring matches).
      gpl_locations <- read_delim("GPL_ID_LOC.txt", delim = "\t", col_names = TRUE)
      IDLOCAL <- which(gpl_locations$GPL_FILE_NUM == as.numeric(gplnum))
      idLOCGPL <- gpl_locations$LOC_ID[IDLOCAL]
    }
    IDF <- length(idLOCGPL)
    if (IDF == 0) {
      #No information on this particular GPL file
      # Look for the "ID" header cell in the first 1000 rows; skip = position - 1.
      first_rows <- read_delim(genena, delim = "\t", col_names = FALSE,
                               comment = "!", n_max = 1000)
      idLOCGPL <- grep("^ID\\s*$", t(first_rows)) - 1
      if (length(idLOCGPL) == 0) {
        stop("No 'ID' header found in the first 1000 rows of ", genena,
             call. = FALSE)
      }
      idLOCGPL <- idLOCGPL[1]
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      if (fileex == 1) {
        # Index file already exists: append one tab-separated row.
        cat(Firstval, file = "GPL_ID_LOC.txt", sep = "\t", fill = TRUE, append = TRUE)
      } else {
        #We must create a file that we can access for later use
        colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(Firstval, file = "GPL_ID_LOC.txt", sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    } else {
      # Several index rows for the same platform: use the first one.
      idLOCGPL <- idLOCGPL[1]
    }
    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
      dplyr::select(., ID, grep(gene_name_cols, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(gene_name_cols, colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
294		304
295		305
296		306
##Changing the gene ID to gene name
# Row 1 of cgeneID()'s result holds the translated identifiers.
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full RAW Data
Fullalzdwr <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)


#Raw file is output
# Build "GSE<digits>aftexcel.txt" from the digits of the file name only.
# The path is split on "\", "|" and "/" exactly as for the GPL file; splitting
# on "\" alone let digits from directory names leak into the output name
# whenever the path uses forward slashes.
nfnaex <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "aftexcel.txt") %>%
  paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
320		330
321		331
## Now for the discretization part
## Get the wordy part again, transposed.
rawword <- t(ALZWORDF)

## Index of the row(s) whose name contains "ID_REF".
## NOTE(review): if nothing matches, `hereim` is empty and the `-hereim`
## subsets below select nothing at all instead of everything -- confirm that
## an ID_REF row is always present.
hereim <- which(grepl("ID_REF", rownames(rawword)))

## Subject names (GSM...): the content of the ID_REF row.
subjnam <- rawword[hereim, ]

## Every other row name becomes the leading column of RAWWORD, followed by the
## remaining rows of rawword. The pipe into as.data.frame() is kept so the
## automatically generated column name stays the same.
namedarows <- rownames(rawword)[-hereim] %>% as.data.frame()
RAWWORD <- bind_cols(namedarows, rawword[-hereim, ] %>% as.data.frame())
## Count the missing entries in every row of RAWWORD and append the counts as
## a trailing "ROW_NAs" column.
## A row's count is the number of real NA cells plus the number of cells that
## match the pattern "NA".
## NOTE(review): grep() is handed a one-row data frame, which it coerces with
## as.character(); a real NA cell may therefore be counted by both terms, and
## any text that merely contains "NA" is counted too -- confirm this is the
## intended definition.
naroww <- as.data.frame(rep(0, dim(RAWWORD)[1]), stringsAsFactors = FALSE)
## seq_len() keeps the loop from running (as 1, 0) when RAWWORD has no rows.
for (z in seq_len(dim(RAWWORD)[1])) {
  naroww[z, 1] <- as.integer(sum(is.na(RAWWORD[z, ]))) +
    length(grep("NA", RAWWORD[z, ]))
}

colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD, naroww)
347		362
348		363
## One-column data frame "ID_REF" holding the column names of ALZDAT
## (equivalently, the row names of its transpose).
roALZna <- as.data.frame(colnames(ALZDAT))
colnames(roALZna) <- "ID_REF"

## Transposed ALZDAT as a data frame, stripped of its column and row names.
RAWDAT <- as.data.frame(t(ALZDAT))
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

## ID_REF column in front of the values, rows ordered by ID_REF.
RAWDAT2 <- dplyr::arrange(cbind(roALZna, RAWDAT), ID_REF)
362		377
## Editing the file for R processing
## First column of RAWDAT2 (the ID_REF values) as a matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

## Every remaining column passed through as.numeric(), then transposed so
## that the former columns of RAWDAT2 become the rows of RAWDATNUM.
## NOTE(review): as.numeric() on a factor column returns level codes rather
## than the printed values -- confirm these columns are never factors.
RAWDATNUM <- t(mapply(FUN = as.numeric, RAWDAT2[, -1]))
370		385
## Consolidating genes with the same name
## tabRDATID counts how often each ID occurs; NuRDATN gets one column per
## distinct ID and as many rows as RAWDATNUM.
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
## seq_along() keeps the loop from running (as 1, 0) on an empty table.
for (j in seq_along(tabRDATID)) {
  ## Columns of RAWDATNUM that belong to the j-th distinct ID.
  gene_cols <- which(RAWDATID == rownames(tabRDATID)[j])

  if (tabRDATID[j] == 1) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, gene_cols]
  } else if (tabRDATID[j] > 1) {
    ## Averaging duplicates and putting them in their new homes.
    ## drop = FALSE keeps a matrix even when RAWDATNUM has a single row, which
    ## rowMeans() requires. A row whose duplicates are all NA yields NaN.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, gene_cols, drop = FALSE], na.rm = TRUE)
  }
  ## IDs with a count of 0 keep the initial 0 column.
}
388		402
## Scaling the data
## scale() centres and scales each column of NuRDATN; the two attributes it
## attaches are removed so that only the plain matrix is left, and the columns
## are labelled with the distinct IDs.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

## Discretizing the data
## dndat() is defined elsewhere in this script; its result is transposed and
## stored as a data frame whose columns are labelled with the row names of
## RAWDATNUM.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

## Setting "ID_REF" as a new variable: the row names move into a leading
## ID_REF column and are then cleared from the data frame.
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat)))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)
408		422
## NAs in a column
## Build a single summary row: "COL_NAs" in the first (ID_REF) position and,
## for every other column of dialzdat, the number of NA entries. The row is
## then appended to the bottom of dialzdat.
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
## seq_len(n)[-1] is 2..n, and empty when dialzdat has only one column
## (2:n would count backwards in that case).
for (x in seq_len(dim(dialzdat)[2])[-1]) {
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[, x])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)
419		433
## NAs in a row
## Number of NA cells in every row of dialzdat (all columns included),
## computed in one vectorised pass instead of indexing the data frame row by
## row, and appended as a trailing "ROW_NAs" column.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

## Every column between the first (ID_REF) and the last (ROW_NAs) is labelled
## with the subject names; RAWWORD then takes over the same column names.
## NOTE(review): this relies on subjnam having exactly one entry per such
## column and on RAWWORD having as many columns as dialzdat -- confirm.
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
431	##converting to character so that the clinical can be brought together with discrete data	445	##converting to character so that the clinical can be brought together with discrete data
432	k <- 2	446	k <- 2
433	for(k in 2:dim(dialzdat)[2]-1){	447	for(k in 2:dim(dialzdat)[2]-1){
434	dialzdat[,k] <- as.character(dialzdat[,k])	448	dialzdat[,k] <- as.character(dialzdat[,k])