Efrain Gonzalez / Cleaning and Fixing Data with R

Browse Code »

Commit 16b4f55de1bee74c9f9060e6884d4d27c04cfe45

Authored by Efrain Gonzalez 2017-05-26 12:18:30 -0400

1 parent a66a63dc50

Exists in master

wrong extension

Showing 1 changed file with 0 additions and 198 deletions Show diff stats

Rclean.txt

View file @ 16b4f55

1	#Libraries required to run the code		File was deleted
2	library(MASS)
3	library(pryr)
4	library(dplyr)
5	library(tidyr)
6	library(readr)
7	library(stringr)
8
9
10	#Necessary Functions
11	#1#Function for handling the changing of row names and column names
# Rename the columns of a transposed GEO sample-annotation matrix.
#
# mat: character matrix whose first row holds the GEO field label of each
#      column (e.g. "!Sample_title") and whose second row holds the first
#      sample's value for that field (e.g. "age: 80").
#
# Returns `mat` with its column names replaced wherever a rule below matches;
# columns that match nothing keep their existing names.
#
# Keyword-derived names get a running numeric suffix (Sex1, Sex2, ...) so
# several columns of the same kind stay distinguishable.
chngrownm <- function(mat) {
  # One counter per keyword family; each is bumped every time it is used.
  n_sex <- 1
  n_pmi <- 1
  n_age <- 1
  n_braak <- 1
  n_group <- 1

  # seq_len() keeps the loop from running on a zero-column input.
  for (j in seq_len(dim(mat)[2])) {
    label <- mat[1, j]

    if ("!Sample_source_name_ch1" == label) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if ("!Sample_title" == label) {
      colnames(mat)[j] <- "Title"
    }
    if ("!Sample_geo_accession" == label) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      # NOTE(review): this branch is skipped only for the accession column, so
      # the Brain_Region and Title columns are still checked below and will be
      # renamed again if their first value contains one of the keywords.
      # The checks are independent: when several match, the last match decides
      # the final name and every matching counter is still incremented.
      value <- mat[2, j]

      # Alternation is a plain "|"; "\|" is not a valid escape in an R string.
      if (grepl("Sex|gender|Gender|sex", value)) {
        colnames(mat)[j] <- paste0("Sex", n_sex)
        n_sex <- n_sex + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", value)) {
        colnames(mat)[j] <- paste0("PMI", n_pmi)
        n_pmi <- n_pmi + 1
      }
      if (grepl("age|Age|AGE", value)) {
        colnames(mat)[j] <- paste0("Age", n_age)
        n_age <- n_age + 1
      }
      if (grepl("braak|b&b", value)) {
        colnames(mat)[j] <- paste0("Braak", n_braak)
        n_braak <- n_braak + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",
                value)) {
        colnames(mat)[j] <- paste0("Group", n_group)
        n_group <- n_group + 1
      }
    }
  }
  mat
}
57
58	#2#Function for reorganizing information within the columns
# Clean the values inside the annotation columns, chosen by column name.
#
# mat: table (data frame as used in this file) whose column names contain
#      "Group", "Age", "Sex", "PMI" or "Braak".
#
# Returns `mat` with the matching columns rewritten:
#   Group - drops everything up to the last ": " and any " ...;..." tail
#   Age   - keeps digits only, converted to integer
#   Sex   - drops everything up to the last ": "
#   PMI   - keeps digits and dots only, converted to numeric
#   Braak - drops the "label: " prefix, reads the rest as a Roman numeral and
#           converts it to integer
# The first column is never touched.
cinfo <- function(mat) {
  # seq_len(n)[-1] is 2..n, and empty when there are fewer than two columns
  # (2:n would count backwards and process column 1 instead).
  for (j in seq_len(dim(mat)[2])[-1]) {
    nm <- colnames(mat)[j]

    if (grepl("Group", nm)) {
      # Alternation is a plain "|"; "\|" is not a valid escape in an R string.
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", nm)) {
      # Every non-digit is removed, so a decimal point is dropped as well
      # ("80.5" becomes 805).
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", nm)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", nm)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", nm)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}
86
87	#3#Function for changing the gene ID to gene name
# Replace probe IDs in the first row of DATA with their gene names.
#
# GeneName: matrix whose first row holds probe IDs and whose second row holds
#           the gene name for each ID (in this file: t() of the GPL table).
# DATA:     matrix whose first row holds the probe IDs to translate
#           (in this file: t() of the expression table).
#
# Returns DATA with each first-row entry that equals an ID in GeneName
# replaced by the corresponding gene name; all other cells are unchanged.
#
# IDs are compared for exact equality. Treating them as regular expressions
# (grep/gsub) lets "ILMN_1" match inside "ILMN_10" and corrupt it, and
# misreads regex metacharacters that appear in IDs. When an ID occurs more
# than once in GeneName, the first occurrence is used.
cgeneID <- function(GeneName, DATA) {
  data_ids <- as.character(DATA[1, ])
  hit <- match(data_ids, as.character(GeneName[1, ]))
  # A missing ID must never pair up with a missing ID in the lookup table.
  found <- !is.na(hit) & !is.na(data_ids)
  if (any(found)) {
    DATA[1, found] <- GeneName[2, hit[found]]
  }
  DATA
}
98
99	#4#Function for adjusting the gene names
# Pick one gene name per column from "///"-separated column names.
#
# DiData: matrix or data frame whose column names may list several gene names
#         separated by "///".
# usecol: position of the name to keep (default 1, the first one).
#
# Returns a character vector with one entry per column: the `usecol`-th piece
# of the column name, or the first piece when the name has fewer than `usecol`
# pieces. An empty column name yields NA. Pieces are returned exactly as
# split; surrounding spaces are not trimmed.
gcnames <- function(DiData, usecol = 1) {
  nuruns <- dim(DiData)[2]
  # Nothing to do for a table without columns (1:0 would iterate anyway).
  if (nuruns == 0) {
    return(character(0))
  }
  # Split each name once instead of re-splitting it for every access.
  pieces <- strsplit(colnames(DiData), "///")
  vapply(
    pieces,
    function(p) if (length(p) >= usecol) p[usecol] else p[1],
    character(1),
    USE.NAMES = FALSE
  )
}
115
116
117
118	#The Rest of this code will be used every time you want to change a data set
119
120	#Getting the series matrix file
# Ask for the two input files. file.choose() opens an interactive picker, so
# this script is meant to be run interactively.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file
##strsplit(alz,"[\\]") %>%
## .[[1]] %>%
## .[-length(.)] %>%
## paste(.,collapse="/") %>%
## setwd()


#Working with the wordy part of the document
# Keep only the "!Sample..." annotation rows, minus the contact details.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
# Transpose so each annotation field becomes a column and each sample a row.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
# do.NULL = FALSE generates placeholder names "col1", "col2", ...
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Drop the first row (the field labels) once chngrownm() has used it.
ALZWORD <- chngrownm(ALZWORD)[-1, ]
# Columns still carrying a placeholder "col..." name were not recognised.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Transpose without the first (probe ID) column: rows = samples, cols = probes.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


##Gene ID to Gene Name
# Alternation is a plain "|"; "\|" is not a valid escape in an R string.
geneIDNam <- genena %>%
  read_delim(delim = "\t", comment = "#") %>%
  dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))

##Changing the ID to a Name
# Row 1 of the translated matrix holds one gene name per probe.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
# Sample annotations first, expression values after.
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

##since the order in which the packages are added matters I moved this package to the top
##library(MASS)
# Build the output names from the digits in the input file name only.
# basename() handles both "/" and "\\" separators; splitting on "\\" alone
# leaves the whole path in place on non-Windows systems, so digits from the
# directory names would end up in the output name.
gse_digits <- gsub("\\D", "", basename(alz))
nfna <- paste0("GSE", gse_digits, "after.txt")
MASS::write.matrix(Fullalzdw, file = nfna, sep = "\t")
#Perfect for excel viewing
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")
199		1	#Libraries required to run the code