Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 83db0077e429f9efd63267d3dee589fbaf5c85a8

Authored by Efrain Gonzalez 2017-06-19 12:53:33 -0400

1 parent e340baf086

Exists in master

Moved to new folder

Showing 2 changed files Show diff stats

RCode/RClean2.R

Diff comments View file @ 83db007

1	#Libraries required to run the code	1	#Libraries required to run the code
2	library(pryr)	2	library(pryr)
3	library(MASS)	3	library(MASS)
4	library(dplyr)	4	library(dplyr)
5	library(tidyr)	5	library(tidyr)
6	library(readr)	6	library(readr)
7	library(stringr)	7	library(stringr)
8		8
9		9
10	#Necessary Functions	10	#Necessary Functions
#1#Function for handling the changing of row names and column names
#   mat: character matrix with named columns. Row 1 holds the series matrix
#        field names (e.g. "!Sample_title"); row 2 holds the first sample's
#        values.
#   Returns mat with renamed columns; cell contents are never modified.
#   Columns holding one of the three fixed GEO fields are renamed from row 1
#   alone and are never reclassified. Every other column is classified from
#   keywords found in row 2 and receives exactly one numbered label (Sex1,
#   Age1, ...). Columns that match nothing keep their current name.
chngrownm <- function(mat) {
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  #Listed from highest to lowest precedence. When a value matches several
  #patterns (e.g. "braak stage" also contains "age") only the first label
  #listed here is used, and only that label's counter advances.
  keyword_patterns <- c(
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
    Braak = "braak|b&b",
    Age = "age|Age|AGE",
    PMI = "postmorteminterval|PMI|pmi",
    Sex = "Sex|gender|Gender|sex"
  )
  next_index <- rep(1L, length(keyword_patterns))
  names(next_index) <- names(keyword_patterns)
  has_values <- nrow(mat) >= 2

  for (j in seq_len(ncol(mat))) {
    field <- as.character(mat[1, j])
    if (!is.na(field) && field %in% names(fixed_names)) {
      colnames(mat)[j] <- fixed_names[[field]]
      next
    }
    if (!has_values) {
      next
    }
    matched <- vapply(keyword_patterns, grepl, logical(1), x = mat[2, j])
    if (any(matched)) {
      label <- names(keyword_patterns)[which(matched)[1]]
      colnames(mat)[j] <- paste0(label, next_index[[label]])
      next_index[[label]] <- next_index[[label]] + 1L
    }
  }
  mat
}
57		57
#2#Function for reorganizing information within the columns
#   mat: data frame (or matrix) of sample metadata whose columns were named
#        by chngrownm(). The first column is never modified.
#   Returns mat where, from the second column on:
#     Group* keeps the text after "label: " (and drops " ...;..." tails)
#     Age*   becomes the first number in the cell, truncated to an integer
#     Sex*   keeps the text after "label: "
#     PMI*   becomes numeric after stripping everything but digits and dots
#     Braak* is read as a roman numeral and converted to an integer
cinfo <- function(mat) {
  n_col <- ncol(mat)
  #With fewer than two columns there is nothing to clean; 2:n_col would
  #otherwise count backwards and touch the first column.
  if (is.null(n_col) || n_col < 2) {
    return(mat)
  }

  #First run of digits (with an optional decimal part) in each value, or NA
  #when a value holds no number. Stripping every non-digit instead would
  #fuse "85.5" into 855.
  first_number <- function(values) {
    values <- as.character(values)
    found <- regexpr("[0-9]+(\\.[0-9]+)?", values)
    has_number <- !is.na(found) & found > -1L
    result <- rep(NA_character_, length(values))
    result[has_number] <- regmatches(values, found)
    result
  }

  for (j in 2:n_col) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(as.numeric(first_number(mat[, j])))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86		86
#3#Function for labeling the gene IDs without names
#   GIDNAM: data frame (or matrix) with the gene ID in column 1 and the gene
#           name in column 2.
#   Returns GIDNAM where every name that is missing (NA) or is the text "NA"
#   (optionally followed by whitespace) has been replaced by the ID found in
#   the same row. Works on whole columns, so empty tables are returned
#   unchanged and large annotation tables do not need a row-by-row loop.
NAFIXING <- function(GIDNAM) {
  is_frame <- is.data.frame(GIDNAM)
  if (is_frame) {
    ids <- GIDNAM[[1]]
    symbols <- GIDNAM[[2]]
  } else {
    ids <- GIDNAM[, 1]
    symbols <- GIDNAM[, 2]
  }
  #A factor cannot take IDs that are not already among its levels
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- as.character(ids[unnamed])

  if (is_frame) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
99		99
#4#Function for changing the gene ID to gene name
#   GeneName: character matrix with the gene IDs in row 1 and the matching
#             gene names in row 2.
#   DATA:     character matrix whose first row holds the gene IDs to
#             translate.
#   Returns DATA with every ID in its first row that is listed in GeneName
#   replaced by the matching gene name; all other cells are left as they
#   are. IDs are compared as whole strings after trimming surrounding
#   whitespace, so an ID such as "10" can no longer rewrite part of "100" and
#   characters like "." in an ID are not treated as wildcards. When an ID is
#   listed more than once in GeneName the first entry is used.
cgeneID <- function(GeneName, DATA) {
  data_ids <- trimws(as.character(DATA[1, ]))
  known_ids <- trimws(as.character(GeneName[1, ]))
  #incomparables = NA: a missing ID never matches a missing annotation ID
  position <- match(data_ids, known_ids, incomparables = NA)
  translated <- !is.na(position)
  DATA[1, translated] <- GeneName[2, position[translated]]
  DATA
}
111		111
#5#Function for adjusting the gene names
#   DiData: matrix or data frame whose column names may list several gene
#           names separated by "///" (e.g. "DDR1 /// MIR4640").
#   usecol: which of the listed names to keep (default: the first one).
#   Returns a character vector with one name per column of DiData. When a
#   column lists fewer than usecol names its first name is used. Whitespace
#   around the separator is removed so the names do not carry stray blanks.
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0) {
    return(character(0))
  }
  candidates <- strsplit(colnames(DiData), "///", fixed = TRUE)
  vapply(
    candidates,
    function(listed) {
      listed <- trimws(listed)
      if (length(listed) >= usecol) listed[usecol] else listed[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
128		128
129		129
130		130
#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

#File name of the GPL file without its directory. basename() uses the path
#separators of the platform the script runs on.
gplfile <- basename(genena)

#Find out if it is a soft GPL file or not
soft <- grepl("soft", gplfile)

#Working with the wordy part of the document
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
#The first row only holds the field names. drop = FALSE keeps a matrix even
#when the file describes a single sample.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


##Gene ID to Gene Name
#Reads a GPL annotation table and keeps the ID column together with every
#column whose name contains "Symbol" or "ORF".
read_gpl_table <- function(path, comment, skip = 0) {
  path %>%
    read_delim(delim = "\t", col_names = TRUE, comment = comment, skip = skip) %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
}

#Looks at the first 1000 rows of a soft file (lines starting with "!" are
#treated as comments), counts the cells that start with a non-digit character
#and subtracts one. The result is used below as the number of lines to skip
#before the annotation table.
#NOTE(review): the count depends on how read_delim() shapes rows of unequal
#width and on the probe IDs starting with a digit - confirm for new platforms.
count_gpl_header_lines <- function(path) {
  path %>%
    read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
    t(.) %>%
    grep("^\\D", .) %>%
    length() - 1
}

if (soft) {
  gplnum <- gsub("\\D", "", gplfile)
  #GPL_ID_LOC.txt (in the working directory) remembers the skip value found
  #for each soft file that has been used before
  loc_file <- "GPL_ID_LOC.txt"
  loc_known <- file.exists(loc_file)
  idlocgpl <- integer(0)
  if (loc_known) {
    #Compare the GPL number as a whole number: a substring search would let
    #"96" also match "1960"
    loc_table <- read_delim(loc_file, delim = "\t", col_names = TRUE)
    seen <- which(loc_table$GPL_FILE_NUM == as.integer(gplnum))
    idlocgpl <- loc_table$LOC_ID[seen]
  }
  if (length(idlocgpl) == 0) {
    #No information on this particular GPL file yet: work it out and store it
    #for later use. The header line is only written when the file is created.
    idLOCGPL <- count_gpl_header_lines(genena)
    write.table(
      cbind(GPL_FILE_NUM = as.integer(gplnum), LOC_ID = as.integer(idLOCGPL)),
      file = loc_file, sep = "\t", row.names = FALSE,
      col.names = !loc_known, append = loc_known
    )
    idlocgpl <- idLOCGPL
  }
  geneIDNam <- read_gpl_table(genena, comment = "!", skip = idlocgpl[1])
} else {
  geneIDNam <- read_gpl_table(genena, comment = "#")
}

if (ncol(geneIDNam) < 2) {
  stop("No Symbol or ORF column was found in the GPL file: ", genena,
       call. = FALSE)
}

##Labeling the gene IDs without names
geneIDNam <- NAFIXING(geneIDNam)

##remove the whitespace
#Result: a two column character matrix (ID, gene name)
idname_chr <- t(geneIDNam)
geneIDNam <- cbind(str_trim(idname_chr[1, ]), str_trim(idname_chr[2, ]))

##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)


#Perfect for excel viewing
#The output name is built from the digits in the series matrix file name only.
#Splitting the path on "\" alone kept the whole path on systems that use "/",
#so digits from directory names ended up in the output name.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
#NOTE(review): row.names = FALSE leaves out the row names of t(Fullalzdw),
#which are the column names of Fullalzdw - confirm that this is wanted.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t", row.names = FALSE)
281		281
282		282
283		283

RCode/RClean3.R

Diff comments View file @ 83db007

1	#Libraries required to run the code	1	#Libraries required to run the code
2	library(pryr)	2	library(pryr)
3	library(MASS)	3	library(MASS)
4	library(dplyr)	4	library(dplyr)
5	library(tidyr)	5	library(tidyr)
6	library(readr)	6	library(readr)
7	library(stringr)	7	library(stringr)
8		8
9		9
10	#Necessary Functions	10	#Necessary Functions
#1#Function for handling the changing of row names and column names
#   mat: character matrix with named columns. Row 1 holds the series matrix
#        field names (e.g. "!Sample_title"); row 2 holds the first sample's
#        values.
#   Returns mat with renamed columns; cell contents are never modified.
#   Columns holding one of the three fixed GEO fields are renamed from row 1
#   alone and are never reclassified. Every other column is classified from
#   keywords found in row 2 and receives exactly one numbered label (Sex1,
#   Age1, ...). Columns that match nothing keep their current name.
chngrownm <- function(mat) {
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  #Listed from highest to lowest precedence. When a value matches several
  #patterns (e.g. "braak stage" also contains "age") only the first label
  #listed here is used, and only that label's counter advances.
  keyword_patterns <- c(
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
    Braak = "braak|b&b",
    Age = "age|Age|AGE",
    PMI = "postmorteminterval|PMI|pmi",
    Sex = "Sex|gender|Gender|sex"
  )
  next_index <- rep(1L, length(keyword_patterns))
  names(next_index) <- names(keyword_patterns)
  has_values <- nrow(mat) >= 2

  for (j in seq_len(ncol(mat))) {
    field <- as.character(mat[1, j])
    if (!is.na(field) && field %in% names(fixed_names)) {
      colnames(mat)[j] <- fixed_names[[field]]
      next
    }
    if (!has_values) {
      next
    }
    matched <- vapply(keyword_patterns, grepl, logical(1), x = mat[2, j])
    if (any(matched)) {
      label <- names(keyword_patterns)[which(matched)[1]]
      colnames(mat)[j] <- paste0(label, next_index[[label]])
      next_index[[label]] <- next_index[[label]] + 1L
    }
  }
  mat
}
57		57
#2#Function for reorganizing information within the columns
#   mat: data frame (or matrix) of sample metadata whose columns were named
#        by chngrownm(). The first column is never modified.
#   Returns mat where, from the second column on:
#     Group* keeps the text after "label: " (and drops " ...;..." tails)
#     Age*   becomes the first number in the cell, truncated to an integer
#     Sex*   keeps the text after "label: "
#     PMI*   becomes numeric after stripping everything but digits and dots
#     Braak* is read as a roman numeral and converted to an integer
cinfo <- function(mat) {
  n_col <- ncol(mat)
  #With fewer than two columns there is nothing to clean; 2:n_col would
  #otherwise count backwards and touch the first column.
  if (is.null(n_col) || n_col < 2) {
    return(mat)
  }

  #First run of digits (with an optional decimal part) in each value, or NA
  #when a value holds no number. Stripping every non-digit instead would
  #fuse "85.5" into 855.
  first_number <- function(values) {
    values <- as.character(values)
    found <- regexpr("[0-9]+(\\.[0-9]+)?", values)
    has_number <- !is.na(found) & found > -1L
    result <- rep(NA_character_, length(values))
    result[has_number] <- regmatches(values, found)
    result
  }

  for (j in 2:n_col) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(as.numeric(first_number(mat[, j])))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86		86
#3#Function for labeling the gene IDs without names
#   GIDNAM: data frame (or matrix) with the gene ID in column 1 and the gene
#           name in column 2.
#   Returns GIDNAM where every name that is missing (NA) or is the text "NA"
#   (optionally followed by whitespace) has been replaced by the ID found in
#   the same row. Works on whole columns, so empty tables are returned
#   unchanged and large annotation tables do not need a row-by-row loop.
NAFIXING <- function(GIDNAM) {
  is_frame <- is.data.frame(GIDNAM)
  if (is_frame) {
    ids <- GIDNAM[[1]]
    symbols <- GIDNAM[[2]]
  } else {
    ids <- GIDNAM[, 1]
    symbols <- GIDNAM[, 2]
  }
  #A factor cannot take IDs that are not already among its levels
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- as.character(ids[unnamed])

  if (is_frame) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}
99		99
#4#Function for changing the gene ID to gene name
#   GeneName: character matrix with the gene IDs in row 1 and the matching
#             gene names in row 2.
#   DATA:     character matrix whose first row holds the gene IDs to
#             translate.
#   Returns DATA with every ID in its first row that is listed in GeneName
#   replaced by the matching gene name; all other cells are left as they
#   are. IDs are compared as whole strings after trimming surrounding
#   whitespace, so an ID such as "10" can no longer rewrite part of "100" and
#   characters like "." in an ID are not treated as wildcards. When an ID is
#   listed more than once in GeneName the first entry is used.
cgeneID <- function(GeneName, DATA) {
  data_ids <- trimws(as.character(DATA[1, ]))
  known_ids <- trimws(as.character(GeneName[1, ]))
  #incomparables = NA: a missing ID never matches a missing annotation ID
  position <- match(data_ids, known_ids, incomparables = NA)
  translated <- !is.na(position)
  DATA[1, translated] <- GeneName[2, position[translated]]
  DATA
}
112		112
#5#Function for adjusting the gene names
#   DiData: matrix or data frame whose column names may list several gene
#           names separated by "///" (e.g. "DDR1 /// MIR4640").
#   usecol: which of the listed names to keep (default: the first one).
#   Returns a character vector with one name per column of DiData. When a
#   column lists fewer than usecol names its first name is used. Whitespace
#   around the separator is removed so the names do not carry stray blanks.
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0) {
    return(character(0))
  }
  candidates <- strsplit(colnames(DiData), "///", fixed = TRUE)
  vapply(
    candidates,
    function(listed) {
      listed <- trimws(listed)
      if (length(listed) >= usecol) listed[usecol] else listed[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
129		129
130		130
131		131
#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

#File name of the GPL file without its directory. basename() uses the path
#separators of the platform the script runs on.
gplfile <- basename(genena)

#Find out if it is a soft GPL file or not
soft <- grepl("soft", gplfile)

#Working with the wordy part of the document
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
#The first row only holds the field names. drop = FALSE keeps a matrix even
#when the file describes a single sample.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


#Reads a GPL annotation table and keeps the ID column together with every
#column whose name contains "Symbol" or "ORF".
read_gpl_table <- function(path, comment, skip = 0) {
  path %>%
    read_delim(delim = "\t", col_names = TRUE, comment = comment, skip = skip) %>%
    dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))
}

#Looks at the first 1000 rows of a soft file (lines starting with "!" are
#treated as comments), counts the cells that start with a non-digit character
#and subtracts one. The result is used below as the number of lines to skip
#before the annotation table.
#NOTE(review): the count depends on how read_delim() shapes rows of unequal
#width and on the probe IDs starting with a digit - confirm for new platforms.
count_gpl_header_lines <- function(path) {
  path %>%
    read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
    t(.) %>%
    grep("^\\D", .) %>%
    length() - 1
}

##Is there a clean version of the GPL file available?
gplnum <- gsub("\\D", "", gplfile)
#Test for the exact file: a prefix search would accept Clean_GPL9600.txt when
#looking for GPL96 and then fail to open Clean_GPL96.txt
clean_file <- paste0("Clean_GPL", gplnum, ".txt")
if (file.exists(clean_file)) {
  #use the clean version
  #Both columns are read as text so numeric looking IDs stay exactly as they
  #were written
  geneIDNam <- clean_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), col_types = "cc",
               comment = "!")
} else {
  ##Lets Create a clean version

  ##Gene ID to Gene Name
  if (soft) {
    #GPL_ID_LOC.txt (in the working directory) remembers the skip value found
    #for each soft file that has been used before
    loc_file <- "GPL_ID_LOC.txt"
    loc_known <- file.exists(loc_file)
    idlocgpl <- integer(0)
    if (loc_known) {
      #Compare the GPL number as a whole number: a substring search would let
      #"96" also match "1960"
      loc_table <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      seen <- which(loc_table$GPL_FILE_NUM == as.integer(gplnum))
      idlocgpl <- loc_table$LOC_ID[seen]
    }
    if (length(idlocgpl) == 0) {
      #No information on this particular GPL file yet: work it out and store
      #it for later use. The header line is only written when the file is
      #created.
      idLOCGPL <- count_gpl_header_lines(genena)
      write.table(
        cbind(GPL_FILE_NUM = as.integer(gplnum), LOC_ID = as.integer(idLOCGPL)),
        file = loc_file, sep = "\t", row.names = FALSE,
        col.names = !loc_known, append = loc_known
      )
      idlocgpl <- idLOCGPL
    }
    geneIDNam <- read_gpl_table(genena, comment = "!", skip = idlocgpl[1])
  } else {
    geneIDNam <- read_gpl_table(genena, comment = "#")
  }

  if (ncol(geneIDNam) < 2) {
    stop("No Symbol or ORF column was found in the GPL file: ", genena,
         call. = FALSE)
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  #Result: a two column character matrix (ID, gene name)
  idname_chr <- t(geneIDNam)
  geneIDNam <- cbind(str_trim(idname_chr[1, ]), str_trim(idname_chr[2, ]))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}


##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)


#Perfect for excel viewing
#The output name is built from the digits in the series matrix file name only.
#Splitting the path on "\" alone kept the whole path on systems that use "/",
#so digits from directory names ended up in the output name.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
303		303