Efrain Gonzalez / Cleaning and Fixing Data with R

Commit adfed316993d072e749a8cd85434fc667c054f22

Authored by Efrain Gonzalez 2017-06-16 13:42:58 -0400

Exists in master

An automated version of the RCleanDscret.R

Working on outputting more insightful errors and warnings. (UNTESTED)

Showing 1 changed file with 752 additions and 0 deletions Show diff stats

RAutoClDs.R

Diff comments View file @ adfed31

File was created	1	#Efrain H. Gonzalez
	2	#6/19/2017
	3	#Libraries required to run the code
	4	library(pryr)
	5	library(MASS)
	6	library(dplyr)
	7	library(tidyr)
	8	library(readr)
	9	library(stringr)
	10
	11
	12	#Necessary Functions
	13	#1#Function for handling the changing of row names and column names
	14	chngrownm <- function(mat){
	15	row <- dim(mat)[1]
	16	col <- dim(mat)[2]
	17	j <- 1
	18	x <- 1
	19	p <- 1
	20	a <- 1
	21	b <- 1
	22	g <- 1
	23	for(j in 1:col){
	24	if("!Sample_source_name_ch1"==mat[1,j]){
	25	colnames(mat)[j] <- "Brain_Region"
	26	}
	27	if("!Sample_title" == mat[1,j]){
	28	colnames(mat)[j] <- "Title"
	29	}
	30	if("!Sample_geo_accession" == mat[1,j]){
	31	colnames(mat)[j] <- "ID_REF"
	32	} else{
	33	if(grepl("Sex\|gender\|Gender\|sex",mat[2,j])==TRUE){
	34	colnames(mat)[j] <- paste0("Sex",x)
	35	x = x + 1
	36	}
	37	if(grepl("postmorteminterval\|PMI\|pmi",mat[2,j])==TRUE){
	38	colnames(mat)[j] <- paste0("PMI",p)
	39	p = p + 1
	40	}
	41	if(grepl("age\|Age\|AGE",mat[2,j])==TRUE){
	42	colnames(mat)[j] <- paste0("Age",a)
	43	a = a + 1
	44	}
	45	if(grepl("braak\|b&b",mat[2,j])==TRUE){
	46	colnames(mat)[j] <- paste0("Braak",b)
	47	b = b + 1
	48	}
	49	if(grepl("group\|disease\|control\|AD\|normal\|diagnosis\|Alzheimer\|Control\|Normal",mat[2,j])==TRUE){
	50	colnames(mat)[j] <- paste0("Group",g)
	51	g = g + 1
	52	}
	53
	54	}
	55	j = j + 1
	56	}
	57	mat
	58	}
	59
	60	#2#Function for reorganizing information within the columns
	61	cinfo <- function(mat){
	62	col <- dim(mat)[2]
	63	j <-2
	64	for(j in 2:col){
	65	if(grepl("Group",colnames(mat)[j]) == TRUE){
	66	mat[,j] <- gsub(".+:\\s\|\\s.+;.+","",mat[,j])
	67	}
	68	if(grepl("Age",colnames(mat)[j])==TRUE){
	69	mat[,j] <- gsub("\\D","",mat[,j])%>%
	70	as.integer()
	71	}
	72	if(grepl("Sex",colnames(mat)[j])==TRUE){
	73	mat[,j] <- gsub(".+:\\s","",mat[,j])
	74	}
	75	if(grepl("PMI",colnames(mat)[j])==TRUE){
	76	mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%
	77	as.numeric()
	78	}
	79	if(grepl("Braak",colnames(mat)[j])==TRUE){
	80	mat[,j]<-gsub(".+:\\s","",mat[,j])%>%
	81	as.roman()%>%
	82	as.integer()
	83	}
	84	j=j+1
	85	}
	86	mat
	87	}
	88
	89	#3#Function for labeling the gene IDs without names
	90	NAFIXING <- function(GIDNAM){
	91	row <- dim(GIDNAM)[1]
	92	i <- 1
	93	for(i in 1:row){
	94	if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE\|\|is.na(GIDNAM[i,2])==TRUE){
	95	GIDNAM[i,2] <- GIDNAM[i,1]
	96	}
	97	i <- i + 1
	98	}
	99	GIDNAM
	100	}
	101
	102	#4#Function for changing the gene ID to gene name
	103	cgeneID <- function(GeneName,DATA){
	104	colGene <- dim(GeneName)[2]
	105	j <- 1
	106	for(j in 1:colGene){
	107	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
	108	if(is.na(sum(chngsreq))==FALSE){
	109	if(sum(chngsreq) > 0){
	110	DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
	111	}
	112	}
	113	j = j+1
	114	}
	115	DATA
	116	}
	117
	118	#5#Function for adjusting the gene names
	119	gcnames <- function(DiData,usecol=1){
	120	nuruns <- dim(DiData)[2]
	121	i = 1
	122	nwnam <- rep("0",length.out=nuruns)
	123	for(i in 1:nuruns){
	124	if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){
	125	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])
	126	} else{
	127	nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])
	128	}
	129
	130	}
	131	nwnam
	132
	133	}
	134
	135	#6# Function for discretizing the data
	136	dndat <- function(NDATA){
	137	rownd <- dim(NDATA)[1]
	138	colnd <- dim(NDATA)[2]
	139	DDATA <- matrix(0,nrow=rownd,ncol=colnd)
	140	colnames(DDATA) <- colnames(NDATA)
	141	i <- 1
	142	for(i in 1:rownd){
	143	j <- 1
	144	for(j in 1:colnd){
	145	if(is.na(NDATA[i,j])==FALSE){
	146
	147	if(NDATA[i,j] < -1){
	148	DDATA[i,j]=0L
	149	}
	150	if(NDATA[i,j] > 1){
	151	DDATA[i,j]=2L
	152	}
	153	if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
	154	DDATA[i,j]=1L
	155	}
	156	} else{
	157	DDATA[i,j] = NDATA[i,j]
	158	}
	159	j = j + 1
	160	}
	161	i = i + 1
	162	}
	163	DDATA
	164	}
	165
	166
	167	#MajorFunction#This is the function that does everything else
	168	THEFT <- function(){
	169	#Set working directory based on the directory of the series matrix file Currently only works for windows
	170	wd <- getwd()
	171	#list.files()
	172	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
	173	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
	174	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
	175
	176	#ALL DATA FILES WILL BE CLEANED
	177	if(numDAT == 1){
	178	#indexing the data files
	179	n <- 1
	180	for(n in 1: length(GSEfileloc)){
	181	alz <- list.files()[GSEfileloc[n]]
	182
	183	#Working with the wordy part of the document
	184	alzword <- alz %>%
	185	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
	186	filter(grepl("!Sample",X1))%>%
	187	filter(!grepl("!Sample_contact",X1))
	188
	189	#Getting the GPL file
	190	genena <- grep("_platform_id",alzword$X1) %>%
	191	alzword$X2[.] %>%
	192	str_trim(.) %>%
	193	paste0("^",.) %>%
	194	grep(.,list.files()) %>%
	195	list.files()[.]
	196
	197	#Find out if it is a soft GPL file or not
	198	soft <- strsplit(genena,"[\\\|/]") %>%
	199	.[[1]] %>%
	200	.[length(.)] %>%
	201	grepl("soft",.)
	202
	203	##Changing row names and column names:
	204	ALZWORD <- t(alzword)
	205	rownames(ALZWORD)=NULL
	206	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
	207	ALZWORD <- chngrownm(ALZWORD)[-1,]
	208	ALZWORD <- ALZWORD%>%
	209	as.data.frame()%>%
	210	dplyr::select(-starts_with("col"))
	211
	212	##Reorganizing information within the columns and final clinical data
	213	ALZWORDF <- cinfo(ALZWORD)
	214
	215
	216	#Working with Actual Data part of file
	217	alzdat <- alz %>%
	218	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
	219	ALZDAT <- t(alzdat[,-1])
	220	rownames(ALZDAT)=NULL
	221
	222	##Is there a clean version of the GPL file available?
	223	gplnum <- strsplit(genena,"[\\\|/]") %>%
	224	.[[1]] %>%
	225	.[length(.)] %>%
	226	gsub("\\D","",.)
	227	clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
	228	if(clfileex >= 1){
	229	#use the clean version
	230	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
	231	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
	232
	233	}
	234	if(clfileex == 0){
	235	##Lets Create a clean version
	236
	237	##Gene ID to Gene Name
	238	if(soft == TRUE){
	239	#Check to see if there is already a file containing information on soft files
	240	fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
	241	if(fileex == 1){
	242	#Check to see if this GPL soft file has been used before
	243	IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	244	.$GPL_FILE_NUM%>%
	245	grepl(gplnum,.) %>%
	246	sum()
	247	if(IDF == 1){
	248	IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	249	.$GPL_FILE_NUM%>%
	250	grep(gplnum,.)
	251	idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	252	.$LOC_ID %>%
	253	.[IDLOCAL]
	254	geneIDNam <- genena %>%
	255	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
	256	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	257	}
	258	if(IDF == 0){
	259	#No information on this particular GPL file
	260	idLOCGPL <- genena %>%
	261	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
	262	t(.) %>%
	263	grep("^ID\\s*$",.) %>%
	264	-1
	265	cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
	266	cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
	267	geneIDNam <- genena %>%
	268	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	269	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	270	}
	271	}
	272	if(fileex == 0){
	273	#We must create a file that we can access for later use
	274	idLOCGPL <- genena %>%
	275	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
	276	t(.) %>%
	277	grep("^ID\\s*$",.) %>%
	278	-1
	279	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
	280	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
	281	write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
	282	geneIDNam <- genena %>%
	283	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	284	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	285	}
	286	}
	287	if(soft == FALSE){
	288	geneIDNam <- genena %>%
	289	read_delim(delim="\t",comment = "#")%>%
	290	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	291	}
	292
	293	##Labeling the gene IDs without names
	294	geneIDNam <- NAFIXING(geneIDNam)
	295
	296	##remove the whitespace
	297	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
	298
	299	##Here is the clean version
	300	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
	301	}
	302
	303
	304
	305	##Changing the gene ID to gene name
	306	ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	307	colnames(ALZDAT) = ALZDAT1[1,]
	308
	309
	310	##Adjusting the column names aka the gene names
	311	colnames(ALZDAT) <- gcnames(ALZDAT)
	312
	313
	314	#Full RAW Data
	315	Fullalzdwr <- ALZDAT %>%
	316	as.data.frame() %>%
	317	cbind(ALZWORDF,.)
	318
	319	#Raw file is output
	320	nfnaex <- strsplit(alz,"[\\]") %>%
	321	.[[1]] %>%
	322	.[length(.)] %>%
	323	gsub("\\D","",.) %>%
	324	c("GSE",.,"aftexcel.txt") %>%
	325	paste(collapse = "")
	326	write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
	327
	328
	329
	330	#Now for the discretization part
	331	##get the wordy part again
	332	rawword <- t(ALZWORDF)
	333
	334	##where is ID_REF located
	335	hereim <- grep("ID_REF",rownames(rawword))
	336
	337	##Subject Names GSM...
	338	subjnam <- rawword[hereim,]
	339
	340	##Getting the names for the rows
	341	namedarows <- rownames(rawword)[-hereim] %>%
	342	as.data.frame()
	343	RAWWORD <- rawword[-hereim,] %>%
	344	as.data.frame() %>%
	345	bind_cols(namedarows,.)
	346	z <- 1
	347	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
	348	for(z in 1:dim(RAWWORD)[1]){
	349	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	350	z <- z + 1
	351	}
	352
	353	colnames(naroww) <- "ROW_NAs"
	354	RAWWORD <- bind_cols(RAWWORD,naroww)
	355
	356
	357	roALZna <- t(ALZDAT) %>%
	358	rownames(.) %>%
	359	as.data.frame(.)
	360	colnames(roALZna) <- "ID_REF"
	361
	362	RAWDAT <- t(ALZDAT) %>%
	363	as.data.frame(.)
	364	colnames(RAWDAT) <- NULL
	365	rownames(RAWDAT) <- NULL
	366
	367	RAWDAT2 <- RAWDAT %>%
	368	cbind(roALZna,.) %>%
	369	dplyr::arrange(.,ID_REF)
	370
	371	##Editing the file for R processing
	372	RAWDATID <- RAWDAT2[,1] %>%
	373	as.matrix(.)
	374
	375	RAWDATNUM <- RAWDAT2[,-1] %>%
	376	mapply(.,FUN = as.numeric) %>%
	377	t(.)
	378
	379	##Consolidating genes with the same name
	380	###create empty matrix of size equal to tabRDATID
	381	tabRDATID <- table(RAWDATID)
	382	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
	383	j <- 1
	384	for(j in 1:length(tabRDATID)){
	385	##Putting the ones without duplicates in their new homes
	386	if(tabRDATID[j] == 1){
	387	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
	388	}
	389	##Averaging duplicates and putting them in their new homes
	390	if(tabRDATID[j] > 1){
	391	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
	392	}
	393	j <- j + 1
	394	}
	395
	396	##Scaling the Data
	397	scrawdat <- NuRDATN%>%
	398	scale()
	399	attr(scrawdat,"scaled:center") <- NULL
	400	attr(scrawdat,"scaled:scale") <- NULL
	401	colnames(scrawdat) <- rownames(tabRDATID)
	402
	403	##Discretized the Data
	404	dialzdat <- scrawdat %>%
	405	dndat(.) %>%
	406	t()%>%
	407	as.data.frame(.)
	408	colnames(dialzdat) <- rownames(RAWDATNUM)
	409
	410	##setting "ID_REF" as a new variable
	411	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
	412	colnames(geneNAM) <- "ID_REF"
	413	rownames(dialzdat) <- NULL
	414	dialzdat <-bind_cols(geneNAM,dialzdat)
	415
	416	##NAs in a column
	417	x <- 2
	418	nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
	419	nacol[1,1] = "COL_NAs"
	420	for(x in 2:dim(dialzdat)[2]){
	421	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
	422	x <- x + 1
	423	}
	424	colnames(nacol) <- colnames(dialzdat)
	425	dialzdat <- bind_rows(dialzdat,nacol)
	426
	427	##NAs in a row
	428	y <- 1
	429	narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
	430	for(y in 1:dim(dialzdat)[1]){
	431	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
	432	y <- y + 1
	433	}
	434	colnames(narowd) <- "ROW_NAs"
	435	dialzdat <- bind_cols(dialzdat,narowd)
	436	colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
	437	colnames(RAWWORD) <- colnames(dialzdat)
	438	##converting to character so that the clinical can be brought together with discrete data
	439	k <- 2
	440	for(k in 2:dim(dialzdat)[2]-1){
	441	dialzdat[,k] <- as.character(dialzdat[,k])
	442	k <- k + 1
	443	}
	444	#The End the full data
	445	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
	446
	447	#Produces Discrete file
	448	nfnaex2 <- strsplit(alz,"[\\\|/]") %>%
	449	.[[1]] %>%
	450	.[length(.)] %>%
	451	gsub("\\D","",.) %>%
	452	c("GSE",.,"dscrt.txt") %>%
	453	paste(collapse = "")
	454	write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
	455	n <- n +1
	456	}
	457	}
	458
	459	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
	460	if(numDAT == 2){
	461	#All the files you want to analyze
	462	ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
	463	if(length(ANDIS) == 0){
	464	#Spit out a warning
	465	warning("You did not select any files and so no cleaning will be performed")
	466	} else{
	467	#indexing the data files
	468	n <- 1
	469	for(n in 1: length(ANDIS)){
	470	alz <- ANDIS[n]
	471
	472	#Working with the wordy part of the document
	473	alzword <- alz %>%
	474	read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
	475	filter(grepl("!Sample",X1))%>%
	476	filter(!grepl("!Sample_contact",X1))
	477
	478	#Getting the GPL file
	479	genena <- grep("_platform_id",alzword$X1) %>%
	480	alzword$X2[.] %>%
	481	str_trim(.) %>%
	482	paste0("^",.) %>%
	483	grep(.,list.files()) %>%
	484	list.files()[.]
	485
	486	#Find out if it is a soft GPL file or not
	487	soft <- strsplit(genena,"[\\\|/]") %>%
	488	.[[1]] %>%
	489	.[length(.)] %>%
	490	grepl("soft",.)
	491
	492	##Changing row names and column names:
	493	ALZWORD <- t(alzword)
	494	rownames(ALZWORD)=NULL
	495	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
	496	ALZWORD <- chngrownm(ALZWORD)[-1,]
	497	ALZWORD <- ALZWORD%>%
	498	as.data.frame()%>%
	499	dplyr::select(-starts_with("col"))
	500
	501	##Reorganizing information within the columns and final clinical data
	502	ALZWORDF <- cinfo(ALZWORD)
	503
	504
	505	#Working with Actual Data part of file
	506	alzdat <- alz %>%
	507	read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
	508	ALZDAT <- t(alzdat[,-1])
	509	rownames(ALZDAT)=NULL
	510
	511	##Is there a clean version of the GPL file available?
	512	gplnum <- strsplit(genena,"[\\\|/]") %>%
	513	.[[1]] %>%
	514	.[length(.)] %>%
	515	gsub("\\D","",.)
	516	clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
	517	if(clfileex >= 1){
	518	#use the clean version
	519	geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%
	520	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
	521
	522	}
	523	if(clfileex == 0){
	524	##Lets Create a clean version
	525
	526	##Gene ID to Gene Name
	527	if(soft == TRUE){
	528	#Check to see if there is already a file containing information on soft files
	529	fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
	530	if(fileex == 1){
	531	#Check to see if this GPL soft file has been used before
	532	IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	533	.$GPL_FILE_NUM%>%
	534	grepl(gplnum,.) %>%
	535	sum()
	536	if(IDF == 1){
	537	IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	538	.$GPL_FILE_NUM%>%
	539	grep(gplnum,.)
	540	idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
	541	.$LOC_ID %>%
	542	.[IDLOCAL]
	543	geneIDNam <- genena %>%
	544	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
	545	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	546	}
	547	if(IDF == 0){
	548	#No information on this particular GPL file
	549	idLOCGPL <- genena %>%
	550	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
	551	t(.) %>%
	552	grep("^ID\\s*$",.) %>%
	553	-1
	554	cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
	555	cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
	556	geneIDNam <- genena %>%
	557	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	558	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	559	}
	560	}
	561	if(fileex == 0){
	562	#We must create a file that we can access for later use
	563	idLOCGPL <- genena %>%
	564	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
	565	t(.) %>%
	566	grep("^ID\\s*$",.) %>%
	567	-1
	568	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
	569	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
	570	write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
	571	geneIDNam <- genena %>%
	572	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
	573	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	574	}
	575	}
	576	if(soft == FALSE){
	577	geneIDNam <- genena %>%
	578	read_delim(delim="\t",comment = "#")%>%
	579	dplyr::select(.,ID,grep("Symbol\|^ORF\\s$\|^gene_assignment\\s$",colnames(.)))
	580	}
	581
	582	##Labeling the gene IDs without names
	583	geneIDNam <- NAFIXING(geneIDNam)
	584
	585	##remove the whitespace
	586	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
	587
	588	##Here is the clean version
	589	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
	590	}
	591
	592
	593
	594	##Changing the gene ID to gene name
	595	ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
	596	colnames(ALZDAT) = ALZDAT1[1,]
	597
	598
	599	##Adjusting the column names aka the gene names
	600	colnames(ALZDAT) <- gcnames(ALZDAT)
	601
	602
	603	#Full RAW Data
	604	Fullalzdwr <- ALZDAT %>%
	605	as.data.frame() %>%
	606	cbind(ALZWORDF,.)
	607
	608	#Raw file is output
	609	nfnaex <- strsplit(alz,"[\\]") %>%
	610	.[[1]] %>%
	611	.[length(.)] %>%
	612	gsub("\\D","",.) %>%
	613	c("GSE",.,"aftexcel.txt") %>%
	614	paste(collapse = "")
	615	write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
	616
	617
	618
	619	#Now for the discretization part
	620	##get the wordy part again
	621	rawword <- t(ALZWORDF)
	622
	623	##where is ID_REF located
	624	hereim <- grep("ID_REF",rownames(rawword))
	625
	626	##Subject Names GSM...
	627	subjnam <- rawword[hereim,]
	628
	629	##Getting the names for the rows
	630	namedarows <- rownames(rawword)[-hereim] %>%
	631	as.data.frame()
	632	RAWWORD <- rawword[-hereim,] %>%
	633	as.data.frame() %>%
	634	bind_cols(namedarows,.)
	635	z <- 1
	636	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
	637	for(z in 1:dim(RAWWORD)[1]){
	638	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	639	z <- z + 1
	640	}
	641
	642	colnames(naroww) <- "ROW_NAs"
	643	RAWWORD <- bind_cols(RAWWORD,naroww)
	644
	645
	646	roALZna <- t(ALZDAT) %>%
	647	rownames(.) %>%
	648	as.data.frame(.)
	649	colnames(roALZna) <- "ID_REF"
	650
	651	RAWDAT <- t(ALZDAT) %>%
	652	as.data.frame(.)
	653	colnames(RAWDAT) <- NULL
	654	rownames(RAWDAT) <- NULL
	655
	656	RAWDAT2 <- RAWDAT %>%
	657	cbind(roALZna,.) %>%
	658	dplyr::arrange(.,ID_REF)
	659
	660	##Editing the file for R processing
	661	RAWDATID <- RAWDAT2[,1] %>%
	662	as.matrix(.)
	663
	664	RAWDATNUM <- RAWDAT2[,-1] %>%
	665	mapply(.,FUN = as.numeric) %>%
	666	t(.)
	667
	668	##Consolidating genes with the same name
	669	###create empty matrix of size equal to tabRDATID
	670	tabRDATID <- table(RAWDATID)
	671	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
	672	j <- 1
	673	for(j in 1:length(tabRDATID)){
	674	##Putting the ones without duplicates in their new homes
	675	if(tabRDATID[j] == 1){
	676	NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
	677	}
	678	##Averaging duplicates and putting them in their new homes
	679	if(tabRDATID[j] > 1){
	680	NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
	681	}
	682	j <- j + 1
	683	}
	684
	685	##Scaling the Data
	686	scrawdat <- NuRDATN%>%
	687	scale()
	688	attr(scrawdat,"scaled:center") <- NULL
	689	attr(scrawdat,"scaled:scale") <- NULL
	690	colnames(scrawdat) <- rownames(tabRDATID)
	691
	692	##Discretized the Data
	693	dialzdat <- scrawdat %>%
	694	dndat(.) %>%
	695	t()%>%
	696	as.data.frame(.)
	697	colnames(dialzdat) <- rownames(RAWDATNUM)
	698
	699	##setting "ID_REF" as a new variable
	700	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
	701	colnames(geneNAM) <- "ID_REF"
	702	rownames(dialzdat) <- NULL
	703	dialzdat <-bind_cols(geneNAM,dialzdat)
	704
	705	##NAs in a column
	706	x <- 2
	707	nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
	708	nacol[1,1] = "COL_NAs"
	709	for(x in 2:dim(dialzdat)[2]){
	710	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
	711	x <- x + 1
	712	}
	713	colnames(nacol) <- colnames(dialzdat)
	714	dialzdat <- bind_rows(dialzdat,nacol)
	715
	716	##NAs in a row
	717	y <- 1
	718	narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
	719	for(y in 1:dim(dialzdat)[1]){
	720	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
	721	y <- y + 1
	722	}
	723	colnames(narowd) <- "ROW_NAs"
	724	dialzdat <- bind_cols(dialzdat,narowd)
	725	colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
	726	colnames(RAWWORD) <- colnames(dialzdat)
	727	##converting to character so that the clinical can be brought together with discrete data
	728	k <- 2
	729	for(k in 2:dim(dialzdat)[2]-1){
	730	dialzdat[,k] <- as.character(dialzdat[,k])
	731	k <- k + 1
	732	}
	733	#The End the full data
	734	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
	735
	736	#Produces Discrete file
	737	nfnaex2 <- strsplit(alz,"[\\\|/]") %>%
	738	.[[1]] %>%
	739	.[length(.)] %>%
	740	gsub("\\D","",.) %>%
	741	c("GSE",.,"dscrt.txt") %>%
	742	paste(collapse = "")
	743	write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
	744
	745
	746	n <- n + 1
	747	}
	748	}
	749	}
	750	}
	751	#The Rest of this code will be used every time you want to change a data set
	752	THEFT()