Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(pryr)

2

library(pryr)

3

library(MASS)

3

library(MASS)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names

11

#1#Function for handling the changing of row names and column names

12

chngrownm <- function(mat){

12

chngrownm <- function(mat){

13

row <- dim(mat)[1]

13

row <- dim(mat)[1]

14

col <- dim(mat)[2]

14

col <- dim(mat)[2]

15

j <- 1

15

j <- 1

16

x <- 1

16

x <- 1

17

p <- 1

17

p <- 1

18

a <- 1

18

a <- 1

19

b <- 1

19

b <- 1

20

g <- 1

20

g <- 1

21

for(j in 1:col){

21

for(j in 1:col){

22

if("!Sample_source_name_ch1"==mat[1,j]){

22

if("!Sample_source_name_ch1"==mat[1,j]){

23

colnames(mat)[j] <- "Brain_Region"

23

colnames(mat)[j] <- "Brain_Region"

24

}

24

}

25

if("!Sample_title" == mat[1,j]){

25

if("!Sample_title" == mat[1,j]){

26

colnames(mat)[j] <- "Title"

26

colnames(mat)[j] <- "Title"

27

}

27

}

28

if("!Sample_geo_accession" == mat[1,j]){

28

if("!Sample_geo_accession" == mat[1,j]){

29

colnames(mat)[j] <- "ID_REF"

29

colnames(mat)[j] <- "ID_REF"

30

} else{

30

} else{

31

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

31

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

32

colnames(mat)[j] <- paste0("Sex",x)

32

colnames(mat)[j] <- paste0("Sex",x)

33

x = x + 1

33

x = x + 1

34

}

34

}

35

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

35

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

36

colnames(mat)[j] <- paste0("PMI",p)

36

colnames(mat)[j] <- paste0("PMI",p)

37

p = p + 1

37

p = p + 1

38

}

38

}

39

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

39

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

40

colnames(mat)[j] <- paste0("Age",a)

40

colnames(mat)[j] <- paste0("Age",a)

41

a = a + 1

41

a = a + 1

42

}

42

}

43

if(grepl("braak|b&b",mat[2,j])==TRUE){

43

if(grepl("braak|b&b",mat[2,j])==TRUE){

44

colnames(mat)[j] <- paste0("Braak",b)

44

colnames(mat)[j] <- paste0("Braak",b)

45

b = b + 1

45

b = b + 1

46

}

46

}

47

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){

47

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){

48

colnames(mat)[j] <- paste0("Group",g)

48

colnames(mat)[j] <- paste0("Group",g)

49

g = g + 1

49

g = g + 1

50

}

50

}

51

52

}

52

}

53

j = j + 1

53

j = j + 1

54

}

54

}

55

mat

55

mat

56

}

56

}

57

58

#2#Function for reorganizing information within the columns

58

#2#Function for reorganizing information within the columns

59

cinfo <- function(mat){

59

cinfo <- function(mat){

60

col <- dim(mat)[2]

60

col <- dim(mat)[2]

61

j <-2

61

j <-2

62

for(j in 2:col){

62

for(j in 2:col){

63

if(grepl("Group",colnames(mat)[j]) == TRUE){

63

if(grepl("Group",colnames(mat)[j]) == TRUE){

64

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

64

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

65

}

65

}

66

if(grepl("Age",colnames(mat)[j])==TRUE){

66

if(grepl("Age",colnames(mat)[j])==TRUE){

67

mat[,j] <- gsub("\\D","",mat[,j])%>%

67

mat[,j] <- gsub("\\D","",mat[,j])%>%

68

as.integer()

68

as.integer()

69

}

69

}

70

if(grepl("Sex",colnames(mat)[j])==TRUE){

70

if(grepl("Sex",colnames(mat)[j])==TRUE){

71

mat[,j] <- gsub(".+:\\s","",mat[,j])

71

mat[,j] <- gsub(".+:\\s","",mat[,j])

72

}

72

}

73

if(grepl("PMI",colnames(mat)[j])==TRUE){

73

if(grepl("PMI",colnames(mat)[j])==TRUE){

74

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

74

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

75

as.numeric()

75

as.numeric()

76

}

76

}

77

if(grepl("Braak",colnames(mat)[j])==TRUE){

77

if(grepl("Braak",colnames(mat)[j])==TRUE){

78

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

78

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

79

as.roman()%>%

79

as.roman()%>%

80

as.integer()

80

as.integer()

81

}

81

}

82

j=j+1

82

j=j+1

83

}

83

}

84

mat

84

mat

85

}

85

}

86

87

#3#Function for labeling the gene IDs without names

87

#3#Function for labeling the gene IDs without names

88

NAFIXING <- function(GIDNAM){

88

NAFIXING <- function(GIDNAM){

89

row <- dim(GIDNAM)[1]

89

row <- dim(GIDNAM)[1]

90

i <- 1

90

i <- 1

91

for(i in 1:row){

91

for(i in 1:row){

92

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

92

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

93

GIDNAM[i,2] <- GIDNAM[i,1]

93

GIDNAM[i,2] <- GIDNAM[i,1]

94

}

94

}

95

i <- i + 1

95

i <- i + 1

96

}

96

}

97

GIDNAM

97

GIDNAM

98

}

98

}

99

100

#4#Function for changing the gene ID to gene name

100

#4#Function for changing the gene ID to gene name

101

cgeneID <- function(GeneName,DATA){

101

cgeneID <- function(GeneName,DATA){

102

colGene <- dim(GeneName)[2]

102

colGene <- dim(GeneName)[2]

103

j <- 1

103

j <- 1

104

for(j in 1:colGene){

104

for(j in 1:colGene){

105

chngsreq <- grep(GeneName[1,j],DATA[1,])

105

chngsreq <- grep(GeneName[1,j],DATA[1,])

106

#DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

106

#DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

107

DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

107

DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

108

j = j+1

108

j = j+1

109

}

109

}

110

DATA

110

DATA

111

}

111

}

112

113

#5#Function for adjusting the gene names

113

#5#Function for adjusting the gene names

114

gcnames <- function(DiData,usecol=1){

114

gcnames <- function(DiData,usecol=1){

115

nuruns <- dim(DiData)[2]

115

nuruns <- dim(DiData)[2]

116

i = 1

116

i = 1

117

nwnam <- rep("0",length.out=nuruns)

117

nwnam <- rep("0",length.out=nuruns)

118

for(i in 1:nuruns){

118

for(i in 1:nuruns){

119

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

119

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

120

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]

120

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]

121

} else{

121

} else{

122

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]

122

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]

123

}

123

}

124

125

}

125

}

126

nwnam

126

nwnam

127

128

}

128

}

129

130

131

132

#The Rest of this code will be used every time you want to change a data set

132

#The Rest of this code will be used every time you want to change a data set

133

134

#Getting the series matrix file

134

#Getting the series matrix file

135

print("Choose the series matrix file that you want to Analyze")

135

print("Choose the series matrix file that you want to Analyze")

136

alz <- file.choose()

136

alz <- file.choose()

137

138

#Getting the GPL file

138

#Getting the GPL file

139

print("Choose the GPL file that correlates with the above series matrix file")

139

print("Choose the GPL file that correlates with the above series matrix file")

140

genena <- file.choose()

140

genena <- file.choose()

141

142

143

#Set working directory based on the directory of the series matrix file Currently only works for windows

143

#Set working directory based on the directory of the series matrix file Currently only works for windows

144

##strsplit(alz,"[\\]") %>%

144

##strsplit(alz,"[\\]") %>%

145

## .[[1]] %>%

145

## .[[1]] %>%

146

## .[-length(.)] %>%

146

## .[-length(.)] %>%

147

## paste(.,collapse="/") %>%

147

## paste(.,collapse="/") %>%

148

## setwd()

148

## setwd()

149

150

#Find out if it is a soft GPL file or not

150

#Find out if it is a soft GPL file or not

151

soft <- strsplit(genena,"[\\|/]") %>%

151

soft <- strsplit(genena,"[\\|/]") %>%

152

.[[1]] %>%

152

.[[1]] %>%

153

.[length(.)] %>%

153

.[length(.)] %>%

154

grepl("soft",.)

154

grepl("soft",.)

155

156

#Working with the wordy part of the document

156

#Working with the wordy part of the document

157

alzword <- alz %>%

157

alzword <- alz %>%

158

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

158

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

159

filter(grepl("!Sample",X1))%>%

159

filter(grepl("!Sample",X1))%>%

160

filter(!grepl("!Sample_contact",X1))

160

filter(!grepl("!Sample_contact",X1))

161

162

##Changing row names and column names:

162

##Changing row names and column names:

163

ALZWORD <- t(alzword)

163

ALZWORD <- t(alzword)

164

rownames(ALZWORD)=NULL

164

rownames(ALZWORD)=NULL

165

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

165

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

166

ALZWORD <- chngrownm(ALZWORD)[-1,]

166

ALZWORD <- chngrownm(ALZWORD)[-1,]

167

ALZWORD <- ALZWORD%>%

167

ALZWORD <- ALZWORD%>%

168

as.data.frame()%>%

168

as.data.frame()%>%

169

dplyr::select(-starts_with("col"))

169

dplyr::select(-starts_with("col"))

170

171

##Reorganizing information within the columns

171

##Reorganizing information within the columns

172

ALZWORDF <- cinfo(ALZWORD)

172

ALZWORDF <- cinfo(ALZWORD)

173

174

175

#Working with Actual Data part of file

175

#Working with Actual Data part of file

176

alzdat <- alz %>%

176

alzdat <- alz %>%

177

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

177

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

178

ALZDAT <- t(alzdat[,-1])

178

ALZDAT <- t(alzdat[,-1])

179

rownames(ALZDAT)=NULL

179

rownames(ALZDAT)=NULL

180

181

##Is there a clean version of the GPL file available?

181

##Is there a clean version of the GPL file available?

182

gplnum <- strsplit(genena,"[\\|/]") %>%

182

gplnum <- strsplit(genena,"[\\|/]") %>%

183

.[[1]] %>%

183

.[[1]] %>%

184

.[length(.)] %>%

184

.[length(.)] %>%

185

gsub("\\D","",.)

185

gsub("\\D","",.)

186

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

186

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

187

if(clfileex >= 1){

187

if(clfileex >= 1){

188

#use the clean version

188

#use the clean version

189

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

189

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

190

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

190

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

191

192

}

192

}

193

if(clfileex == 0){

193

if(clfileex == 0){

194

##Lets Create a clean version

194

##Lets Create a clean version

195

196

##Gene ID to Gene Name

196

##Gene ID to Gene Name

197

###geneIDNam <- genena %>%

197

###geneIDNam <- genena %>%

198

### read_delim(delim="\t",comment = "#")%>%

198

### read_delim(delim="\t",comment = "#")%>%

199

### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

199

### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

200

###problems with the above for soft files

200

###problems with the above for soft files

201

if(soft == TRUE){

201

if(soft == TRUE){

202

#gplnum <- strsplit(genena,"[\\|/]") %>%

202

#gplnum <- strsplit(genena,"[\\|/]") %>%

203

# .[[1]] %>%

203

# .[[1]] %>%

204

# .[length(.)] %>%

204

# .[length(.)] %>%

205

# gsub("\\D","",.)

205

# gsub("\\D","",.)

206

#Check to see if there is already a file containing information on soft files

206

#Check to see if there is already a file containing information on soft files

207

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

207

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

208

if(fileex == 1){

208

if(fileex == 1){

209

#Check to see if this GPL soft file has been used before

209

#Check to see if this GPL soft file has been used before

210

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

210

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

211

.$GPL_FILE_NUM%>%

211

.$GPL_FILE_NUM%>%

212

grepl(gplnum,.) %>%

212

grepl(gplnum,.) %>%

213

sum()

213

sum()

214

if(IDF == 1){

214

if(IDF == 1){

215

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

215

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

216

.$GPL_FILE_NUM%>%

216

.$GPL_FILE_NUM%>%

217

grep(gplnum,.)

217

grep(gplnum,.)

218

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

218

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

219

.$LOC_ID %>%

219

.$LOC_ID %>%

220

.[IDLOCAL]

220

.[IDLOCAL]

221

geneIDNam <- genena %>%

221

geneIDNam <- genena %>%

222

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

222

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

223

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

223

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

224

}

224

}

225

if(IDF == 0){

225

if(IDF == 0){

226

#No information on this particular GPL file

226

#No information on this particular GPL file

227

idLOCGPL <- genena %>%

227

idLOCGPL <- genena %>%

228

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

228

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

229

t(.) %>%

229

t(.) %>%

230

grep("^\\D",.) %>%

230

grep("^\\D",.) %>%

231

length()-1

231

length()-1

232

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

232

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

233

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

233

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

234

geneIDNam <- genena %>%

234

geneIDNam <- genena %>%

235

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

235

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

236

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

236

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

237

}

237

}

238

}

238

}

239

if(fileex == 0){

239

if(fileex == 0){

240

#We must create a file that we can access for later use

240

#We must create a file that we can access for later use

241

idLOCGPL <- genena %>%

241

idLOCGPL <- genena %>%

242

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

242

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

243

t(.) %>%

243

t(.) %>%

244

grep("^\\D",.) %>%

244

grep("^\\D",.) %>%

245

length()-1

245

length()-1

246

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

246

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

247

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

247

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

248

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

248

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

249

geneIDNam <- genena %>%

249

geneIDNam <- genena %>%

250

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

250

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

251

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

251

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

252

}

252

}

253

}

253

}

254

if(soft == FALSE){

254

if(soft == FALSE){

255

geneIDNam <- genena %>%

255

geneIDNam <- genena %>%

256

read_delim(delim="\t",comment = "#")%>%

256

read_delim(delim="\t",comment = "#")%>%

257

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

257

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

258

}

258

}

259

260

##Labeling the gene IDs without names

260

##Labeling the gene IDs without names

261

geneIDNam <- NAFIXING(geneIDNam)

261

geneIDNam <- NAFIXING(geneIDNam)

262

263

##remove the whitespace

263

##remove the whitespace

264

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

264

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

265

266

##Here is the clean version

266

##Here is the clean version

267

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

267

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

268

}

268

}

269

270

271

272

##Changing the gene ID to gene name

272

##Changing the gene ID to gene name

273

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

273

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

274

colnames(ALZDAT) = ALZDAT1[1,]

274

colnames(ALZDAT) = ALZDAT1[1,]

275

276

277

##Adjusting the column names aka the gene names

277

##Adjusting the column names aka the gene names

278

colnames(ALZDAT) <- gcnames(ALZDAT)

278

colnames(ALZDAT) <- gcnames(ALZDAT)

279

280

281

#Full Data

281

#Full Data

282

Fullalzdw <- ALZDAT %>%

282

Fullalzdw <- ALZDAT %>%

283

as.data.frame() %>%

283

as.data.frame() %>%

284

cbind(ALZWORDF,.)

284

cbind(ALZWORDF,.)

285

286

287

#nfna <- strsplit(alz,"[\\|/]") %>%

287

#nfna <- strsplit(alz,"[\\|/]") %>%

288

# .[[1]] %>%

288

# .[[1]] %>%

289

# .[length(.)] %>%

289

# .[length(.)] %>%

290

# gsub("\\D","",.) %>%

290

# gsub("\\D","",.) %>%

291

# c("GSE",.,"after.txt") %>%

291

# c("GSE",.,"after.txt") %>%

292

# paste(collapse = "")

292

# paste(collapse = "")

293

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

293

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

294

295

#Perfect for excel viewing

295

#Perfect for excel viewing

296

nfnaex <- strsplit(alz,"[\\]") %>%

296

nfnaex <- strsplit(alz,"[\\]") %>%

297

.[[1]] %>%

297

.[[1]] %>%

298

.[length(.)] %>%

298

.[length(.)] %>%

299

gsub("\\D","",.) %>%

299

gsub("\\D","",.) %>%

300

c("GSE",.,"aftexcel.txt") %>%

300

c("GSE",.,"aftexcel.txt") %>%

301

paste(collapse = "")

301

paste(collapse = "")

302

write.table(t(Fullalzdw), file = nfnaex, sep = "\t",row.names = FALSE)

302

write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Updated RClean3

 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 # Renames the columns of a transposed "!Sample_*" character matrix.
 #   mat : matrix (with column names already set) whose row 1 holds the
 #         "!Sample_*" field name of every column and whose row 2 holds the
 #         first sample's value for that field.
 # Returns mat with the recognised columns renamed; every other column keeps
 # the name it came in with.
 # Rules, applied column by column from left to right:
 #   row 1 == "!Sample_source_name_ch1" -> "Brain_Region"
 #   row 1 == "!Sample_title"           -> "Title"
 #   row 1 == "!Sample_geo_accession"   -> "ID_REF"
 #   any column that is not ID_REF has row 2 tested against the keyword
 #   patterns below, in order; each match renames the column to
 #   <prefix><counter> and advances that counter, so a later match overwrites
 #   an earlier one.
 # NOTE(review): only the ID_REF column is shielded from the keyword tests; a
 # Title or Brain_Region column whose first value contains a keyword (for
 # example "AD") is relabelled. Kept as in the original - confirm it is intended.
 chngrownm <- function(mat){
   # prefix -> regular expression searched in row 2 (order matters)
   patterns <- c(
     Sex = "Sex|gender|Gender|sex",
     PMI = "postmorteminterval|PMI|pmi",
     Age = "age|Age|AGE",
     Braak = "braak|b&b",
     Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
   )
   counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)
   # row 2 only exists when at least one sample is present
   has_values <- nrow(mat) >= 2
   # seq_len() keeps the loop empty for a zero-column input (1:0 would not)
   for(j in seq_len(ncol(mat))){
     field <- mat[1,j]
     # isTRUE() makes a missing field name count as "no match" instead of
     # aborting with "missing value where TRUE/FALSE needed"
     if(isTRUE("!Sample_source_name_ch1" == field)){
       colnames(mat)[j] <- "Brain_Region"
     }
     if(isTRUE("!Sample_title" == field)){
       colnames(mat)[j] <- "Title"
     }
     if(isTRUE("!Sample_geo_accession" == field)){
       colnames(mat)[j] <- "ID_REF"
     } else if(has_values){
       for(prefix in names(patterns)){
         if(grepl(patterns[[prefix]],mat[2,j])){
           colnames(mat)[j] <- paste0(prefix,counters[[prefix]])
           counters[[prefix]] <- counters[[prefix]] + 1
         }
       }
     }
   }
   mat
 }
 #2#Function for reorganizing information within the columns
 # Cleans the values of the columns named by chngrownm().
 #   mat : data frame (or matrix) with column names; column 1 is never touched.
 # Returns mat with, for every column from the second one on:
 #   name contains "Group" -> text up to ": " removed, as well as any
 #                            " ...;..." tail
 #   name contains "Age"   -> every non-digit removed, converted to integer
 #   name contains "Sex"   -> text up to ": " removed
 #   name contains "PMI"   -> everything except digits and "." removed,
 #                            converted to numeric
 #   name contains "Braak" -> text up to ": " removed, the remainder read as a
 #                            roman numeral and converted to integer
 # NOTE(review): because the Age rule strips every non-digit, a value such as
 # "85.5" becomes 855 - kept as in the original, confirm ages are whole numbers.
 cinfo <- function(mat){
   # seq_len(n)[-1] is 2..n and is empty when there are fewer than two columns;
   # the former 2:col counted backwards (2, 1) for a single-column input and so
   # processed column 1 as well
   for(j in seq_len(ncol(mat))[-1]){
     field <- colnames(mat)[j]
     if(grepl("Group",field)){
       mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
     }
     if(grepl("Age",field)){
       mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
     }
     if(grepl("Sex",field)){
       mat[,j] <- gsub(".+:\\s","",mat[,j])
     }
     if(grepl("PMI",field)){
       mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
     }
     if(grepl("Braak",field)){
       mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
     }
   }
   mat
 }
 #3#Function for labeling the gene IDs without names
 # Fills in missing gene names with the gene ID of the same row.
 #   GIDNAM : data frame, tibble or matrix; column 1 holds the IDs, column 2
 #            the gene names.
 # Returns GIDNAM where every column-2 entry that is NA, or is the text "NA"
 # (optionally followed by whitespace), has been replaced by the column-1
 # value of that row. Other columns are left alone.
 NAFIXING <- function(GIDNAM){
   ids <- GIDNAM[,1,drop = TRUE]
   symbols <- GIDNAM[,2,drop = TRUE]
   # one vectorised mask instead of a row-by-row loop; it is empty for a
   # zero-row input, where the former 1:row loop touched a non-existent row 1
   unnamed <- is.na(symbols) | grepl("^NA\\s*$",symbols)
   if(any(unnamed)){
     fill <- ids[unnamed]
     # numeric IDs have to become text before they can sit in a text column
     if(is.character(symbols)){
       fill <- as.character(fill)
     }
     GIDNAM[unnamed,2] <- fill
   }
   GIDNAM
 }
 #4#Function for changing the gene ID to gene name
 # Replaces the gene IDs in the first row of DATA by their gene names.
 #   GeneName : matrix with the gene IDs in row 1 and the matching gene names
 #              in row 2 (one column per gene).
 #   DATA     : matrix whose first row holds the gene IDs to translate.
 # Returns DATA with every first-row entry that equals one of the IDs replaced
 # by the corresponding name; all other entries are unchanged.
 # IDs are compared for exact equality. The previous version used every ID as
 # a regular expression anchored only at the start, so ID "A1" also rewrote
 # "A10" to "<name>0", and one regex scan per ID made the run time quadratic.
 # When an ID occurs more than once in GeneName the first occurrence wins.
 cgeneID <- function(GeneName,DATA){
   if(ncol(GeneName) == 0 || ncol(DATA) == 0){
     return(DATA)
   }
   ids <- as.character(GeneName[1,])
   symbols <- as.character(GeneName[2,])
   current <- as.character(DATA[1,])
   hit <- match(current,ids)
   # a missing entry in DATA must not pair up with a missing ID
   found <- !is.na(hit) & !is.na(current)
   if(any(found)){
     DATA[1,found] <- symbols[hit[found]]
   }
   DATA
 }
 #5#Function for adjusting the gene names
 # Picks one gene name out of every "///"-separated column name.
 #   DiData : matrix or data frame with column names.
 #   usecol : position of the name to keep within each "///"-separated list
 #            (default 1).
 # Returns a character vector with one entry per column: the usecol-th piece
 # of the column name, or the first piece when the name has fewer pieces.
 # An empty column name gives NA. Pieces are not trimmed.
 gcnames <- function(DiData,usecol=1){
   # the former 1:nuruns loop failed on an input without columns
   if(ncol(DiData) == 0){
     return(character(0))
   }
   # split every name once instead of up to three times per column
   pieces <- strsplit(colnames(DiData),"///")
   vapply(pieces,function(p){
     if(length(p) >= usecol){
       p[usecol]
     } else{
       p[1]
     }
   },character(1))
 }
 #Everything from here on is run once for each data set that has to be cleaned
 #Ask the user for the series matrix file
 print("Choose the series matrix file that you want to Analyze")
 alz <- file.choose()
 #Ask the user for the GPL file that belongs to that series matrix file
 print("Choose the GPL file that correlates with the above series matrix file")
 genena <- file.choose()
 #Disabled: set the working directory to the folder of the series matrix file
 #(the split below only understands Windows path separators)
 ##strsplit(alz,"[\\]") %>%
 ##	.[[1]] %>%
 ##	.[-length(.)] %>%
 ##	paste(.,collapse="/") %>%
 ##	setwd()
 #The GPL file is treated as a soft file when its file name (the last path
 #component) contains "soft"
 soft <- local({
   parts <- strsplit(genena,"[\\|/]")[[1]]
   grepl("soft",parts[length(parts)])
 })
 #Descriptive part of the file: skip the "!Series" lines and keep the
 #"!Sample" rows, except the "!Sample_contact" ones
 alzword <- read_delim(alz,delim ="\t",comment = "!Series",col_names = FALSE)
 alzword <- filter(alzword,grepl("!Sample",X1),!grepl("!Sample_contact",X1))
 ##One column per "!Sample" field: transpose, give placeholder names
 ##("col1", "col2", ...), let chngrownm() rename the known fields and drop
 ##the row that holds the field names
 ALZWORD <- t(alzword)
 rownames(ALZWORD) <- NULL
 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 ALZWORD <- chngrownm(ALZWORD)[-1,]
 ##Columns that still carry a placeholder name were not recognised: drop them
 ALZWORD <- dplyr::select(as.data.frame(ALZWORD),-starts_with("col"))
 ##Clean up the values inside the recognised columns
 ALZWORDF <- cinfo(ALZWORD)
 #Measurement part of the file: lines starting with "!" are skipped as comments
 alzdat <- read_delim(alz,delim="\t",col_names=TRUE,comment = "!",skip=1)
 #One row per sample, without the first (ID) column
 ALZDAT <- t(alzdat[,-1])
 rownames(ALZDAT) <- NULL
 ##Is there a clean version of the GPL file available?
 #gplnum: the digits found in the GPL file name (last path component)
 gplnum <- strsplit(genena,"[\\|/]") %>%
 	.[[1]] %>%
 	.[length(.)] %>%
 	gsub("\\D","",.)
 #Look for exactly the file that is read below. The former pattern search over
 #list.files() also accepted longer numbers (Clean_GPL5705.txt for GPL570) and
 #then failed to open Clean_GPL570.txt
 clfile <- paste0("Clean_GPL",gplnum,".txt")
 clfileex <- file.exists(clfile)
 if(clfileex){
   #use the clean version
   geneIDNam <- read_delim(clfile,delim="\t",col_names = c("ID","Symbol"), comment = "!")
 } else{
   ##Lets Create a clean version
   ##Gene ID to Gene Name
   if(soft){
     #GPL_ID_LOC.txt remembers, per GPL number, how many lines of the soft
     #file have to be skipped before the annotation table starts
     locfile <- "GPL_ID_LOC.txt"
     fileex <- file.exists(locfile)
     idLOCGPL <- NA
     if(fileex){
       #Check to see if this GPL soft file has been used before. The numbers
       #are compared for equality: the former grepl() search also matched
       #longer numbers, and with two matches neither the "IDF == 1" nor the
       #"IDF == 0" branch ran, which left geneIDNam undefined
       idloctab <- read_delim(locfile,delim = "\t",col_names = TRUE)
       IDLOCAL <- which(idloctab$GPL_FILE_NUM == as.numeric(gplnum))
       IDF <- length(IDLOCAL)
       if(IDF >= 1){
         idLOCGPL <- idloctab$LOC_ID[IDLOCAL[1]]
       }
     }
     if(is.na(idLOCGPL)){
       #No information on this particular GPL file: count the cells of the
       #first 1000 non-"!" lines that start with a non-digit, minus one
       idLOCGPL <- genena %>%
       	read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
       	t(.) %>%
       	grep("^\\D",.) %>%
       	length()-1
       newrow <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
       if(fileex){
         #add a line to the existing file
         cat(newrow,file=locfile,sep = "\t", fill = TRUE, append = TRUE)
       } else{
         #We must create a file that we can access for later use
         colnames(newrow) <- c("GPL_FILE_NUM","LOC_ID")
         write.table(newrow,file = locfile, sep = "\t",row.names = FALSE, col.names = TRUE)
       }
     }
     geneIDNam <- genena %>%
     	read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
     	dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
   } else{
     #Plain GPL table: lines starting with "#" are comments
     geneIDNam <- genena %>%
     	read_delim(delim="\t",comment = "#")%>%
     	dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
   }
   ##Labeling the gene IDs without names
   geneIDNam <- NAFIXING(geneIDNam)
   ##remove the whitespace (only the ID column and the first name column are kept)
   geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
   ##Here is the clean version
   write.table(geneIDNam,file = clfile,sep = "\t",row.names = FALSE, col.names = FALSE)
 }
 ##Changing the gene ID to gene name
 #The first row of t(alzdat) holds the ID column of the data table; cgeneID()
 #swaps those IDs for gene names and the result labels the columns of ALZDAT
 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
 colnames(ALZDAT) <- ALZDAT1[1,]
 ##Adjusting the column names aka the gene names
 #(keep the first name of every "///"-separated list)
 colnames(ALZDAT) <- gcnames(ALZDAT)
 #Full Data: cleaned sample description columns followed by the measurements
 Fullalzdw <- ALZDAT %>%
 	as.data.frame() %>%
 	cbind(ALZWORDF,.)
 #Disabled alternative export through write.matrix():
 #nfna <- strsplit(alz,"[\\|/]") %>%
 #	.[[1]] %>%
 #	.[length(.)] %>%
 #	gsub("\\D","",.) %>%
 #	c("GSE",.,"after.txt") %>%
 #	paste(collapse = "")
 #write.matrix(Fullalzdw,file = nfna,sep = "\t")
 #Perfect for excel viewing
 #Output name: "GSE" + the digits of the series matrix file name +
 #"aftexcel.txt". The path is split on "\", "|" and "/" exactly like the GPL
 #path; splitting on "\" alone let digits from the folder names of a
 #forward-slash path leak into the file name
 nfnaex <- strsplit(alz,"[\\|/]") %>%
 	.[[1]] %>%
 	.[length(.)] %>%
 	gsub("\\D","",.) %>%
 	c("GSE",.,"aftexcel.txt") %>%
 	paste(collapse = "")
 #One row per variable/gene; the row names (variable and gene names) are written
 write.table(t(Fullalzdw), file = nfnaex, sep = "\t")