Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(pryr)

2

library(pryr)

3

library(MASS)

3

library(MASS)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names

11

#1#Function for handling the changing of row names and column names

12

chngrownm <- function(mat){

12

chngrownm <- function(mat){

13

row <- dim(mat)[1]

13

row <- dim(mat)[1]

14

col <- dim(mat)[2]

14

col <- dim(mat)[2]

15

j <- 1

15

j <- 1

16

x <- 1

16

x <- 1

17

p <- 1

17

p <- 1

18

a <- 1

18

a <- 1

19

b <- 1

19

b <- 1

20

g <- 1

20

g <- 1

21

for(j in 1:col){

21

for(j in 1:col){

22

if("!Sample_source_name_ch1"==mat[1,j]){

22

if("!Sample_source_name_ch1"==mat[1,j]){

23

colnames(mat)[j] <- "Brain_Region"

23

colnames(mat)[j] <- "Brain_Region"

24

}

24

}

25

if("!Sample_title" == mat[1,j]){

25

if("!Sample_title" == mat[1,j]){

26

colnames(mat)[j] <- "Title"

26

colnames(mat)[j] <- "Title"

27

}

27

}

28

if("!Sample_geo_accession" == mat[1,j]){

28

if("!Sample_geo_accession" == mat[1,j]){

29

colnames(mat)[j] <- "ID_REF"

29

colnames(mat)[j] <- "ID_REF"

30

} else{

30

} else{

31

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

31

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

32

colnames(mat)[j] <- paste0("Sex",x)

32

colnames(mat)[j] <- paste0("Sex",x)

33

x = x + 1

33

x = x + 1

34

}

34

}

35

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

35

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

36

colnames(mat)[j] <- paste0("PMI",p)

36

colnames(mat)[j] <- paste0("PMI",p)

37

p = p + 1

37

p = p + 1

38

}

38

}

39

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

39

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

40

colnames(mat)[j] <- paste0("Age",a)

40

colnames(mat)[j] <- paste0("Age",a)

41

a = a + 1

41

a = a + 1

42

}

42

}

43

if(grepl("braak|b&b",mat[2,j])==TRUE){

43

if(grepl("braak|b&b",mat[2,j])==TRUE){

44

colnames(mat)[j] <- paste0("Braak",b)

44

colnames(mat)[j] <- paste0("Braak",b)

45

b = b + 1

45

b = b + 1

46

}

46

}

47

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){

47

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){

48

colnames(mat)[j] <- paste0("Group",g)

48

colnames(mat)[j] <- paste0("Group",g)

49

g = g + 1

49

g = g + 1

50

}

50

}

51

52

}

52

}

53

j = j + 1

53

j = j + 1

54

}

54

}

55

mat

55

mat

56

}

56

}

57

58

#2#Function for reorganizing information within the columns

58

#2#Function for reorganizing information within the columns

59

cinfo <- function(mat){

59

cinfo <- function(mat){

60

col <- dim(mat)[2]

60

col <- dim(mat)[2]

61

j <-2

61

j <-2

62

for(j in 2:col){

62

for(j in 2:col){

63

if(grepl("Group",colnames(mat)[j]) == TRUE){

63

if(grepl("Group",colnames(mat)[j]) == TRUE){

64

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

64

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

65

}

65

}

66

if(grepl("Age",colnames(mat)[j])==TRUE){

66

if(grepl("Age",colnames(mat)[j])==TRUE){

67

mat[,j] <- gsub("\\D","",mat[,j])%>%

67

mat[,j] <- gsub("\\D","",mat[,j])%>%

68

as.integer()

68

as.integer()

69

}

69

}

70

if(grepl("Sex",colnames(mat)[j])==TRUE){

70

if(grepl("Sex",colnames(mat)[j])==TRUE){

71

mat[,j] <- gsub(".+:\\s","",mat[,j])

71

mat[,j] <- gsub(".+:\\s","",mat[,j])

72

}

72

}

73

if(grepl("PMI",colnames(mat)[j])==TRUE){

73

if(grepl("PMI",colnames(mat)[j])==TRUE){

74

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

74

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

75

as.numeric()

75

as.numeric()

76

}

76

}

77

if(grepl("Braak",colnames(mat)[j])==TRUE){

77

if(grepl("Braak",colnames(mat)[j])==TRUE){

78

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

78

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

79

as.roman()%>%

79

as.roman()%>%

80

as.integer()

80

as.integer()

81

}

81

}

82

j=j+1

82

j=j+1

83

}

83

}

84

mat

84

mat

85

}

85

}

86

87

#3#Function for labeling the gene IDs without names

87

#3#Function for labeling the gene IDs without names

88

NAFIXING <- function(GIDNAM){

88

NAFIXING <- function(GIDNAM){

89

row <- dim(GIDNAM)[1]

89

row <- dim(GIDNAM)[1]

90

i <- 1

90

i <- 1

91

x <- 1

92

for(i in 1:row){

91

for(i in 1:row){

93

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE){

92

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

94

GIDNAM[i,2] <- gsub(".*",paste0(NA,x),GIDNAM[i,2])

93

GIDNAM[i,2] <- GIDNAM[i,1]

95

x <- x + 1

96

}

94

}

97

i <- i + 1

95

i <- i + 1

98

}

96

}

99

GIDNAM

97

GIDNAM

100

}

98

}

101

99

102

#4#Function for changing the gene ID to gene name

100

#4#Function for changing the gene ID to gene name

103

cgeneID <- function(GeneName,DATA){

101

cgeneID <- function(GeneName,DATA){

104

colGene <- dim(GeneName)[2]

102

colGene <- dim(GeneName)[2]

105

j <- 1

103

j <- 1

106

for(j in 1:colGene){

104

for(j in 1:colGene){

107

chngsreq <- grep(paste0("^",GeneName[1,j]),DATA[1,])

105

chngsreq <- grep(paste0("^",GeneName[1,j]),DATA[1,])

108

DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

106

DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

109

j = j+1

107

j = j+1

110

}

108

}

111

DATA

109

DATA

112

}

110

}

113

111

114

#5#Function for adjusting the gene names

112

#5#Function for adjusting the gene names

115

gcnames <- function(DiData,usecol=1){

113

gcnames <- function(DiData,usecol=1){

116

nuruns <- dim(DiData)[2]

114

nuruns <- dim(DiData)[2]

117

i = 1

115

i = 1

118

nwnam <- rep("0",length.out=nuruns)

116

nwnam <- rep("0",length.out=nuruns)

119

for(i in 1:nuruns){

117

for(i in 1:nuruns){

120

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

118

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

121

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]

119

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]

122

} else{

120

} else{

123

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]

121

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]

124

}

122

}

125

123

126

}

124

}

127

nwnam

125

nwnam

128

126

129

}

127

}

130

128

131

129

132

130

133

#The Rest of this code will be used every time you want to change a data set

131

#The Rest of this code will be used every time you want to change a data set

134

132

135

#Getting the series matrix file

133

#Getting the series matrix file

136

print("Choose the series matrix file that you want to Analyze")

134

print("Choose the series matrix file that you want to Analyze")

137

alz <- file.choose()

135

alz <- file.choose()

138

136

139

#Getting the GPL file

137

#Getting the GPL file

140

print("Choose the GPL file that correlates with the above series matrix file")

138

print("Choose the GPL file that correlates with the above series matrix file")

141

genena <- file.choose()

139

genena <- file.choose()

142

140

143

141

144

#Set working directory based on the directory of the series matrix file Currently only works for windows

142

#Set working directory based on the directory of the series matrix file Currently only works for windows

145

##strsplit(alz,"[\\]") %>%

143

##strsplit(alz,"[\\]") %>%

146

## .[[1]] %>%

144

## .[[1]] %>%

147

## .[-length(.)] %>%

145

## .[-length(.)] %>%

148

## paste(.,collapse="/") %>%

146

## paste(.,collapse="/") %>%

149

## setwd()

147

## setwd()

150

148

151

#Find out if it is a soft GPL file or not

149

#Find out if it is a soft GPL file or not

152

soft <- strsplit(genena,"[\\|/]") %>%

150

soft <- strsplit(genena,"[\\|/]") %>%

153

.[[1]] %>%

151

.[[1]] %>%

154

.[length(.)] %>%

152

.[length(.)] %>%

155

grepl("soft",.)

153

grepl("soft",.)

156

154

157

#Working with the wordy part of the document

155

#Working with the wordy part of the document

158

alzword <- alz %>%

156

alzword <- alz %>%

159

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

157

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

160

filter(grepl("!Sample",X1))%>%

158

filter(grepl("!Sample",X1))%>%

161

filter(!grepl("!Sample_contact",X1))

159

filter(!grepl("!Sample_contact",X1))

162

160

163

##Changing row names and column names:

161

##Changing row names and column names:

164

ALZWORD <- t(alzword)

162

ALZWORD <- t(alzword)

165

rownames(ALZWORD)=NULL

163

rownames(ALZWORD)=NULL

166

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

164

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

167

ALZWORD <- chngrownm(ALZWORD)[-1,]

165

ALZWORD <- chngrownm(ALZWORD)[-1,]

168

ALZWORD <- ALZWORD%>%

166

ALZWORD <- ALZWORD%>%

169

as.data.frame()%>%

167

as.data.frame()%>%

170

dplyr::select(-starts_with("col"))

168

dplyr::select(-starts_with("col"))

171

169

172

##Reorganizing information within the columns

170

##Reorganizing information within the columns

173

ALZWORDF <- cinfo(ALZWORD)

171

ALZWORDF <- cinfo(ALZWORD)

174

172

175

173

176

#Working with Actual Data part of file

174

#Working with Actual Data part of file

177

alzdat <- alz %>%

175

alzdat <- alz %>%

178

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

176

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

179

ALZDAT <- t(alzdat[,-1])

177

ALZDAT <- t(alzdat[,-1])

180

rownames(ALZDAT)=NULL

178

rownames(ALZDAT)=NULL

181

179

182

180

183

##Gene ID to Gene Name

181

##Gene ID to Gene Name

184

###geneIDNam <- genena %>%

182

###geneIDNam <- genena %>%

185

### read_delim(delim="\t",comment = "#")%>%

183

### read_delim(delim="\t",comment = "#")%>%

186

### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

184

### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

187

###problems with the above for soft files

185

###problems with the above for soft files

188

if(soft == TRUE){

186

if(soft == TRUE){

189

gplnum <- strsplit(genena,"[\\|/]") %>%

187

gplnum <- strsplit(genena,"[\\|/]") %>%

190

.[[1]] %>%

188

.[[1]] %>%

191

.[length(.)] %>%

189

.[length(.)] %>%

192

gsub("\\D","",.)

190

gsub("\\D","",.)

193

#Check to see if there is already a file containing information on soft files

191

#Check to see if there is already a file containing information on soft files

194

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

192

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

195

if(fileex == 1){

193

if(fileex == 1){

196

#Check to see if this GPL soft file has been used before

194

#Check to see if this GPL soft file has been used before

197

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

195

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

198

.$GPL_FILE_NUM%>%

196

.$GPL_FILE_NUM%>%

199

grepl(gplnum,.) %>%

197

grepl(gplnum,.) %>%

200

sum()

198

sum()

201

if(IDF == 1){

199

if(IDF == 1){

202

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

200

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

203

.$GPL_FILE_NUM%>%

201

.$GPL_FILE_NUM%>%

204

grep(gplnum,.)

202

grep(gplnum,.)

205

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

203

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

206

.$LOC_ID %>%

204

.$LOC_ID %>%

207

.[IDLOCAL]

205

.[IDLOCAL]

208

geneIDNam <- genena %>%

206

geneIDNam <- genena %>%

209

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

207

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

210

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

208

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

211

}

209

}

212

if(IDF == 0){

210

if(IDF == 0){

213

#No information on this particular GPL file

211

#No information on this particular GPL file

214

idLOCGPL <- genena %>%

212

idLOCGPL <- genena %>%

215

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

213

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

216

t(.) %>%

214

t(.) %>%

217

grep("^\\D",.) %>%

215

grep("^\\D",.) %>%

218

length()-1

216

length()-1

219

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

217

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

220

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

218

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

221

geneIDNam <- genena %>%

219

geneIDNam <- genena %>%

222

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

220

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

223

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

221

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

224

}

222

}

225

}

223

}

226

if(fileex == 0){

224

if(fileex == 0){

227

#We must create a file that we can access for later use

225

#We must create a file that we can access for later use

228

idLOCGPL <- genena %>%

226

idLOCGPL <- genena %>%

229

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

227

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

230

t(.) %>%

228

t(.) %>%

231

grep("^\\D",.) %>%

229

grep("^\\D",.) %>%

232

length()-1

230

length()-1

233

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

231

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

234

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

232

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

235

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

233

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

236

geneIDNam <- genena %>%

234

geneIDNam <- genena %>%

237

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

235

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

238

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

236

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

239

}

237

}

240

}

238

}

241

if(soft == FALSE){

239

if(soft == FALSE){

242

geneIDNam <- genena %>%

240

geneIDNam <- genena %>%

243

read_delim(delim="\t",comment = "#")%>%

241

read_delim(delim="\t",comment = "#")%>%

244

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

242

dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

245

}

243

}

246

244

247

##Labeling the gene IDs without names

245

##Labeling the gene IDs without names

248

geneIDNam <- NAFIXING(geneIDNam)

246

geneIDNam <- NAFIXING(geneIDNam)

249

247

250

##remove the whitespace

248

##remove the whitespace

251

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

249

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

252

250

253

##Changing the gene ID to gene name

251

##Changing the gene ID to gene name

254

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

252

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

255

colnames(ALZDAT) = ALZDAT1[1,]

253

colnames(ALZDAT) = ALZDAT1[1,]

256

254

257

255

258

##Adjusting the column names aka the gene names

256

##Adjusting the column names aka the gene names

259

colnames(ALZDAT) <- gcnames(ALZDAT)

257

colnames(ALZDAT) <- gcnames(ALZDAT)

260

258

261

259

262

#Full Data

260

#Full Data

263

Fullalzdw <- ALZDAT %>%

261

Fullalzdw <- ALZDAT %>%

264

as.data.frame() %>%

262

as.data.frame() %>%

265

cbind(ALZWORDF,.)

263

cbind(ALZWORDF,.)

266

264

267

265

268

#nfna <- strsplit(alz,"[\\]") %>%

266

#nfna <- strsplit(alz,"[\\]") %>%

269

# .[[1]] %>%

267

# .[[1]] %>%

270

# .[length(.)] %>%

268

# .[length(.)] %>%

271

# gsub("\\D","",.) %>%

269

# gsub("\\D","",.) %>%

272

# c("GSE",.,"after.txt") %>%

270

# c("GSE",.,"after.txt") %>%

273

# paste(collapse = "")

271

# paste(collapse = "")

274

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

272

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

275

#Perfect for excel viewing

273

#Perfect for excel viewing

276

nfnaex <- strsplit(alz,"[\\]") %>%

274

nfnaex <- strsplit(alz,"[\\]") %>%

277

.[[1]] %>%

275

.[[1]] %>%

278

.[length(.)] %>%

276

.[length(.)] %>%

279

gsub("\\D","",.) %>%

277

gsub("\\D","",.) %>%

280

c("GSE",.,"aftexcel.txt") %>%

278

c("GSE",.,"aftexcel.txt") %>%

281

paste(collapse = "")

279

paste(collapse = "")

282

write.table(t(Fullalzdw), file = nfnaex, sep = "\t",row.names = FALSE)

280

write.table(t(Fullalzdw), file = nfnaex, sep = "\t",row.names = FALSE)

283

281

284

282

285

283

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

error in NAFIXING fixed

 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 #
 # Renames the columns of a transposed sample-metadata matrix.
 #
 # mat: matrix (or data frame) whose first row holds the series-matrix field
 #      label of each column (e.g. "!Sample_title") and whose second row holds
 #      the first sample's value for that field. Needs at least two rows.
 #
 # Returns `mat` with only its column names changed:
 #   * the three fixed labels below map to Brain_Region / Title / ID_REF;
 #   * any other column whose second-row value matches a characteristic
 #     pattern is named <Label><n> (Sex1, PMI1, Age1, Braak1, Group1, ...);
 #   * columns that match nothing keep their current name.
 #
 # Each column receives exactly one name. A column recognised by its fixed
 # label is never re-labelled from its content (a title such as "AD case 1"
 # stays "Title"), and when several characteristic patterns match, the last
 # one in the list below wins and only that label's counter advances.
 chngrownm <- function(mat){
 	fixed_names <- c(
 		"!Sample_source_name_ch1" = "Brain_Region",
 		"!Sample_title" = "Title",
 		"!Sample_geo_accession" = "ID_REF"
 	)
 	# Order matters: later entries take precedence over earlier ones
 	# (e.g. "braak stage" contains "age" but must end up as Braak).
 	patterns <- c(
 		Sex = "Sex|gender|Gender|sex",
 		PMI = "postmorteminterval|PMI|pmi",
 		Age = "age|Age|AGE",
 		Braak = "braak|b&b",
 		Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
 	)
 	counters <- setNames(rep(1, length(patterns)), names(patterns))
 	# seq_len() keeps the loop from running on a zero-column input.
 	for(j in seq_len(ncol(mat))){
 		header <- as.character(mat[1,j])
 		if(header %in% names(fixed_names)){
 			colnames(mat)[j] <- fixed_names[[header]]
 			next
 		}
 		hits <- vapply(patterns, grepl, logical(1), x = mat[2,j])
 		if(any(hits)){
 			label <- names(patterns)[max(which(hits))]
 			colnames(mat)[j] <- paste0(label, counters[[label]])
 			counters[[label]] <- counters[[label]] + 1
 		}
 	}
 	mat
 }
 #2#Function for reorganizing information within the columns
 #
 # Cleans the values of the metadata columns named by chngrownm().
 #
 # mat: data frame (or matrix) whose column names contain Group / Age / Sex /
 #      PMI / Braak for the columns to clean. The first column is never
 #      touched.
 #
 # Returns `mat` with, for every column after the first:
 #   Group*  - the "label: " prefix removed, plus any " ...;..." tail
 #   Age*    - every non-digit removed, converted to integer
 #             (note: a decimal point is removed too, so "85.5" becomes 855)
 #   Sex*    - the "label: " prefix removed
 #   PMI*    - everything except digits and "." removed, converted to numeric
 #   Braak*  - the "label: " prefix removed, Roman numeral converted to integer
 cinfo <- function(mat){
 	# seq_len(n)[-1] is empty for a one-column input, whereas 2:1 would count
 	# down and process the first column.
 	for(j in seq_len(ncol(mat))[-1]){
 		column_name <- colnames(mat)[j]
 		if(grepl("Group",column_name)){
 			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
 		}
 		if(grepl("Age",column_name)){
 			mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
 		}
 		if(grepl("Sex",column_name)){
 			mat[,j] <- gsub(".+:\\s","",mat[,j])
 		}
 		if(grepl("PMI",column_name)){
 			mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
 		}
 		if(grepl("Braak",column_name)){
 			mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
 		}
 	}
 	mat
 }
 #3#Function for labeling the gene IDs without names
 #
 # Fills in missing gene names with the corresponding gene ID.
 #
 # GIDNAM: two-dimensional table (data frame, tibble or matrix) with the gene
 #         ID in column 1 and the gene name in column 2.
 #
 # Returns GIDNAM where every entry of column 2 that is NA, or is the text
 # "NA" optionally followed by whitespace, has been replaced by the ID from
 # column 1 of the same row. For data frames column 2 is returned as
 # character.
 NAFIXING <- function(GIDNAM){
 	stopifnot(length(dim(GIDNAM)) == 2, ncol(GIDNAM) >= 2)
 	is_frame <- is.data.frame(GIDNAM)
 	ids <- if(is_frame) GIDNAM[[1]] else GIDNAM[,1]
 	gene_names <- if(is_frame) GIDNAM[[2]] else GIDNAM[,2]
 	# Work on character copies so factor levels or a numeric ID column cannot
 	# make the replacement fail or produce NA.
 	gene_names <- as.character(gene_names)
 	unnamed <- is.na(gene_names) | grepl("^NA\\s*$",gene_names)
 	gene_names[unnamed] <- as.character(ids[unnamed])
 	if(is_frame){
 		GIDNAM[[2]] <- gene_names
 	} else{
 		GIDNAM[,2] <- gene_names
 	}
 	GIDNAM
 }
 #4#Function for changing the gene ID to gene name
 #
 # Replaces gene IDs in the first row of DATA by their gene names.
 #
 # GeneName: matrix with one column per gene; row 1 holds the gene ID and
 #           row 2 the gene name.
 # DATA:     matrix whose first row holds gene IDs.
 #
 # Returns DATA with every first-row entry that is exactly equal to an ID in
 # GeneName replaced by the matching name. Entries without a matching ID, and
 # all other rows, are returned unchanged. If an ID occurs more than once in
 # GeneName its first occurrence is used.
 cgeneID <- function(GeneName,DATA){
 	# Exact lookup: treating IDs as regular expressions would let a short ID
 	# such as "10" rewrite part of a longer one such as "100".
 	data_ids <- as.character(unlist(DATA[1,],use.names = FALSE))
 	lookup <- match(data_ids,as.character(GeneName[1,]))
 	found <- !is.na(lookup)
 	DATA[1,found] <- GeneName[2,lookup[found]]
 	DATA
 }
 #5#Function for adjusting the gene names
 #
 # Picks one gene name out of column names that list several alternatives
 # separated by "///".
 #
 # DiData: matrix or data frame whose column names are to be shortened.
 # usecol: position of the alternative to keep (default 1).
 #
 # Returns a character vector with one entry per column name: the usecol-th
 # "///"-separated piece when the name has that many pieces, otherwise the
 # first piece. Pieces are returned as split, without trimming whitespace.
 # An input without column names yields character(0).
 gcnames <- function(DiData,usecol=1){
 	pieces <- strsplit(as.character(colnames(DiData)),"///")
 	vapply(pieces, function(parts){
 		if(length(parts) >= usecol) parts[usecol] else parts[1]
 	}, character(1), USE.NAMES = FALSE)
 }
 #The rest of this script is run every time a data set is to be converted
 #Getting the series matrix file (interactive file picker)
 print("Choose the series matrix file that you want to Analyze")
 alz <- file.choose()
 #Getting the GPL file (interactive file picker)
 print("Choose the GPL file that correlates with the above series matrix file")
 genena <- file.choose()
 #Set working directory based on the directory of the series matrix file Currently only works for windows
 ##strsplit(alz,"[\\]") %>%
 ##	.[[1]] %>%
 ##	.[-length(.)] %>%
 ##	paste(.,collapse="/") %>%
 ##	setwd()
 #Find out if it is a soft GPL file or not: TRUE when the file name itself
 #(not the directories leading to it) contains "soft". basename() handles the
 #platform's path separators without a hand-written split pattern.
 soft <- grepl("soft",basename(genena))
 #Sample metadata ("wordy") part of the series matrix file:
 #keep the !Sample rows, leaving out the !Sample_contact ones
 alzword <- read_delim(alz,delim = "\t",comment = "!Series",col_names = FALSE)
 alzword <- filter(alzword,grepl("!Sample",X1),!grepl("!Sample_contact",X1))
 ##Transpose so each field becomes a column, then name the columns.
 ##Columns start out as col1, col2, ...; chngrownm() renames the ones it
 ##recognises, and whatever is still called col* afterwards is dropped.
 ALZWORD <- t(alzword)
 rownames(ALZWORD) <- NULL
 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL = FALSE)
 ##The first row holds the field labels and is removed once the names are set
 ALZWORD <- chngrownm(ALZWORD)[-1,]
 ALZWORD <- dplyr::select(as.data.frame(ALZWORD),-starts_with("col"))
 ##Clean up the values inside the recognised columns
 ALZWORDF <- cinfo(ALZWORD)
 #Expression data part of the series matrix file: every line starting with
 #"!" is treated as a comment, leaving the table itself
 alzdat <- read_delim(alz,delim = "\t",col_names = TRUE,comment = "!",skip = 1)
 #Transpose without the first (ID) column so each sample becomes a row
 ALZDAT <- t(alzdat[,-1])
 rownames(ALZDAT) <- NULL
 ##Gene ID to Gene Name
 ###geneIDNam <- genena %>%
 ###	read_delim(delim="\t",comment = "#")%>%
 ###	dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
 ###problems with the above for soft files
 if(soft){
 	#Soft files need a number of leading lines skipped before the annotation
 	#table. That number is estimated once per platform and cached in
 	#GPL_ID_LOC.txt (columns GPL_FILE_NUM and LOC_ID) in the working directory.
 	gplnum <- gsub("\\D","",basename(genena))
 	loc_file <- "GPL_ID_LOC.txt"
 	fileex <- file.exists(loc_file)
 	idLOCGPL <- integer(0)
 	if(fileex){
 		#Look the platform up by exact number; a pattern search would let
 		#"96" match "1096" as well.
 		gpl_cache <- read_delim(loc_file,delim = "\t",col_names = TRUE)
 		cached <- which(as.integer(gpl_cache$GPL_FILE_NUM) == as.integer(gplnum))
 		idLOCGPL <- gpl_cache$LOC_ID[cached]
 	}
 	if(length(idLOCGPL) == 0){
 		#No information on this particular GPL file yet: estimate the skip
 		#count from the first 1000 rows and remember it for later runs
 		idLOCGPL <- genena %>%
 			read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
 			t(.) %>%
 			grep("^\\D",.) %>%
 			length()-1
 		new_entry <- cbind(GPL_FILE_NUM = as.integer(gplnum),LOC_ID = as.integer(idLOCGPL))
 		#Write the header only when the cache file is being created
 		write.table(new_entry,file = loc_file, sep = "\t",row.names = FALSE,
 			col.names = !fileex, append = fileex)
 	} else{
 		#Several cached rows for the same platform: use the first
 		idLOCGPL <- idLOCGPL[1]
 	}
 	geneIDNam <- genena %>%
 		read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 		dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
 } else{
 	#Non-soft annotation file: lines starting with "#" are comments
 	geneIDNam <- genena %>%
 		read_delim(delim="\t",comment = "#")%>%
 		dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
 }
 ##Give every gene ID without a name a usable label
 geneIDNam <- NAFIXING(geneIDNam)
 ##Strip surrounding whitespace from the ID column and the name column;
 ##the result is a two-column character matrix
 geneIDNam <- as.matrix(geneIDNam)
 geneIDNam <- cbind(str_trim(geneIDNam[,1]),str_trim(geneIDNam[,2]))
 ##Translate the gene IDs into gene names and use them as column names
 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
 colnames(ALZDAT) <- ALZDAT1[1,]
 ##Keep a single gene name per column where several are listed
 colnames(ALZDAT) <- gcnames(ALZDAT)
 #Full data: sample metadata columns followed by the expression columns
 Fullalzdw <- cbind(ALZWORDF,as.data.frame(ALZDAT))
 #nfna <- strsplit(alz,"[\\]") %>%
 #	.[[1]] %>%
 #	.[length(.)] %>%
 #	gsub("\\D","",.) %>%
 #	c("GSE",.,"after.txt") %>%
 #	paste(collapse = "")
 #write.matrix(Fullalzdw,file = nfna,sep = "\t")
 #Perfect for excel viewing
 #Output name: "GSE" + the digits of the series matrix file name +
 #"aftexcel.txt". basename() is used so that digits in the directory part of
 #the path never end up in the name, whatever the path separator is.
 nfnaex <- paste0("GSE",gsub("\\D","",basename(alz)),"aftexcel.txt")
 #NOTE(review): the table is written transposed and row.names = FALSE leaves
 #out the row names of t(Fullalzdw), i.e. the column names of Fullalzdw -
 #confirm that this is intended.
 write.table(t(Fullalzdw), file = nfnaex, sep = "\t",row.names = FALSE)