# Efrain Gonzalez / Cleaning and Fixing Data with R

1

########################################################################

1

########################################################################

2

# Don't Use This Code Just Yet #

2

# Don't Use This Code Just Yet #

3

########################################################################

3

########################################################################

4

#Efrain H. Gonzalez

4

#Efrain H. Gonzalez

5

#6/16/2017

5

#6/16/2017

6

7

#Libraries required to run the code

7

#Libraries required to run the code

8

library(pryr)

8

library(pryr)

9

library(MASS)

9

library(MASS)

10

library(dplyr)

10

library(dplyr)

11

library(tidyr)

11

library(tidyr)

12

library(readr)

12

library(readr)

13

library(stringr)

13

library(stringr)

14

15

16

#Necessary Functions

16

#Necessary Functions

17

#1# Function for handling the changing of row names and column names
#
# Renames the columns of `mat` from the text found in its first two rows.
#
# Arguments:
#   mat  Character matrix (or data frame). Row 1 is compared against the
#        literal field names "!Sample_source_name_ch1", "!Sample_title" and
#        "!Sample_geo_accession"; row 2 is pattern-matched for every other
#        column. (Presumably a transposed GEO series-matrix header - confirm
#        against the caller.)
#
# Returns:
#   `mat` with updated column names. Cell contents are never modified.
#
# Naming rules:
#   * Row 1 equal to one of the three field names above gives
#     "Brain_Region", "Title" or "ID_REF" respectively.
#   * Otherwise the row-2 text is tested against the Sex, PMI, Age, Braak
#     and Group patterns, in that order. Each match names the column
#     "<Label><k>", where k counts matches of that label so far. The tests
#     are independent (not else-if): a column matching several patterns
#     keeps the LAST matching label, and every matching label's counter
#     still advances.
#   * Columns matching nothing keep their existing name.
chngrownm <- function(mat) {
  # `colnames(mat)[e] <- value` needs a full-length name vector to update, so
  # give an unnamed matrix the default "col1", "col2", ... names first.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  # Label -> regular expression applied to the row-2 text. Order matters:
  # later labels overwrite earlier ones on the same column.
  rules <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- setNames(rep(1L, length(rules)), names(rules))
  has_second_row <- nrow(mat) >= 2L

  for (e in seq_len(ncol(mat))) {
    header <- mat[1, e]
    # isTRUE() makes an NA header fall through to the pattern rules instead
    # of raising "missing value where TRUE/FALSE needed".
    if (isTRUE(header == "!Sample_source_name_ch1")) {
      colnames(mat)[e] <- "Brain_Region"
    } else if (isTRUE(header == "!Sample_title")) {
      colnames(mat)[e] <- "Title"
    } else if (isTRUE(header == "!Sample_geo_accession")) {
      colnames(mat)[e] <- "ID_REF"
    } else {
      # With fewer than two rows there is nothing to classify on; NA never
      # matches any pattern, so the column keeps its name.
      descr <- if (has_second_row) mat[2, e] else NA_character_
      for (label in names(rules)) {
        if (isTRUE(grepl(rules[[label]], descr))) {
          colnames(mat)[e] <- paste0(label, counters[[label]])
          counters[[label]] <- counters[[label]] + 1L
        }
      }
    }
  }
  mat
}

61

62

#2# Function for reorganizing information within the columns
#
# Cleans the cell text of every column except the first, choosing the
# clean-up rule from the column's NAME (the first matching rule wins):
#   name contains "Group" -> strip everything up to ": " and any trailing
#                            " ...;..." remainder
#   name contains "Age"   -> keep digits only, convert to integer
#   name contains "Sex"   -> strip everything up to ": "
#   name contains "PMI"   -> keep digits and "." only, convert to numeric
#   name contains "Braak" -> strip everything up to ": ", read the rest as
#                            a roman numeral, convert to integer
# Columns whose name matches none of these are left as they are.
#
# Arguments:
#   mat  Data frame (or matrix) with column names.
#
# Returns:
#   `mat` with the matching columns rewritten. For a data frame the Age,
#   PMI and Braak columns become integer/numeric; in a character matrix the
#   converted values are coerced back to character by the assignment.
cinfo <- function(mat) {
  # seq_len(n)[-1] is 2..n and is empty when there are fewer than two
  # columns; `2:n` would count down to 1 and rewrite the first column.
  for (j in seq_len(ncol(mat))[-1]) {
    nm <- colnames(mat)[j]
    if (is.null(nm) || is.na(nm)) {
      next
    }
    if (grepl("Group", nm)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", nm)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", nm)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    } else if (grepl("PMI", nm)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", nm)) {
      # Text that is not a valid roman numeral becomes NA.
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

86

87

#3# Function for labeling the gene IDs without names
#
# Fills in missing entries of the second column with the value from the
# first column of the same row. An entry counts as missing when it is NA or
# when its text is "NA" optionally followed by whitespace.
#
# Arguments:
#   GIDNAM  Two-column data frame or matrix; column 1 is copied into
#           column 2 wherever column 2 is missing. (Presumably gene ID and
#           gene name, as the header comment suggests - confirm against
#           the caller.)
#
# Returns:
#   `GIDNAM` with the missing column-2 entries filled in. Input with no
#   missing entries (including zero-row input) is returned unchanged. When
#   a fill is needed and either column is a factor, it is handled as
#   character so that the copied IDs are stored as text rather than as NA
#   or level codes.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  labels <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # One vectorised mask instead of testing and assigning row by row.
  unnamed <- is.na(labels) | grepl("^NA\\s*$", labels)

  if (any(unnamed)) {
    if (is.factor(ids)) {
      ids <- as.character(ids)
    }
    if (is.factor(labels)) {
      labels <- as.character(labels)
    }
    labels[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- labels
    } else {
      GIDNAM[, 2] <- labels
    }
  }
  GIDNAM
}

99

100

#4# Function for changing the gene ID to gene name
#
# Transposes both inputs, then replaces each entry in the first row of the
# transposed DATA by the matching name from the transposed GeneName.
#
# Arguments:
#   GeneName  Two-column table: after transposition, row 1 is used as the
#             lookup keys (IDs) and row 2 as the replacement values (names).
#   DATA      Table whose first column, after transposition the first row,
#             holds the IDs to translate.
#
# Returns:
#   t(DATA) with the first row translated. IDs without a match in GeneName
#   are left as they are. When an ID occurs more than once in GeneName the
#   first occurrence is used.
#
# NOTE(review): the inputs are transposed here before row 1 is indexed, yet
# the call site in this file passes t(geneIDNam) and t(alzdat), i.e. values
# that are already transposed. Confirm which orientation is intended; the
# orientation handling is kept exactly as it was.
cgeneID <- function(GeneName, DATA) {
  nj <- t(GeneName)
  nq <- t(DATA)
  if (ncol(nq) == 0L || ncol(nj) == 0L) {
    return(nq)
  }

  ids <- nq[1, ]
  # Exact lookup. Building "^<id>$" regular expressions from the IDs made
  # metacharacters in an ID match unintended keys ("3.1_at" matches
  # "3x1_at") or abort on an unbalanced "(", and cost one full scan of the
  # key table per ID.
  hit <- match(ids, nj[1, ])
  found <- !is.na(ids) & !is.na(hit)
  # Plain assignment, so backslashes in a name are kept literally rather
  # than being interpreted as gsub() replacement escapes.
  nq[1, found] <- nj[2, hit[found]]
  nq
}

119

#cgeneID <- function(GeneName,DATA){

119

#cgeneID <- function(GeneName,DATA){

120

# colGene <- dim(GeneName)[2]

120

# colGene <- dim(GeneName)[2]

121

# j <- 1

121

# j <- 1

122

# for(j in 1:colGene){

122

# for(j in 1:colGene){

123

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

123

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

124

# if(is.na(sum(chngsreq))==FALSE){

124

# if(is.na(sum(chngsreq))==FALSE){

125

# if(sum(chngsreq) > 0){

125

# if(sum(chngsreq) > 0){

126

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

126

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

127

# }

127

# }

128

# }

128

# }

129

# j = j+1

129

# j = j+1

130

# }

130

# }

131

# DATA

131

# DATA

132

#}

132

#}

133

134

#5# Function for adjusting the gene names
#
# Derives one cleaned name per column of `DiData`. Each column name is
# split on "///"; piece number `usecol` is returned when the name has at
# least that many pieces, otherwise the first piece. Surrounding
# whitespace is trimmed from the chosen piece.
#
# Arguments:
#   DiData  Matrix or data frame with column names.
#   usecol  Which "///"-separated piece to prefer (default 1).
#
# Returns:
#   Character vector with one element per column of `DiData`
#   (character(0) for zero columns). An empty column name yields NA.
gcnames <- function(DiData, usecol = 1) {
  nuruns <- dim(DiData)[2]
  if (nuruns == 0L) {
    return(character(0))
  }

  # Split every name once, rather than twice per column inside a loop.
  pieces <- strsplit(colnames(DiData), "///")
  vapply(
    pieces,
    function(p) {
      chosen <- if (length(p) >= usecol) p[usecol] else p[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

150

151

#6# Function for discretizing the data
#
# Maps every numeric cell of `NDATA` to one of three levels:
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
# Missing cells (NA / NaN) are carried over unchanged.
#
# Arguments:
#   NDATA  Numeric matrix (or all-numeric data frame).
#
# Returns:
#   Numeric matrix with the same dimensions and column names as `NDATA`
#   (row names are not copied), holding the codes 0, 1, 2 or NA.
dndat <- function(NDATA) {
  vals <- as.matrix(NDATA)

  # Start every cell in the middle bin and move the outliers. This closes
  # the middle bin at +1: with the previous `-1 <= x && x < 1` test a value
  # of exactly 1 matched no branch and silently kept the initial 0, i.e.
  # the "low" code.
  DDATA <- matrix(1, nrow = nrow(vals), ncol = ncol(vals))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are not touched here.
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2

  gaps <- is.na(vals)
  DDATA[gaps] <- vals[gaps]
  DDATA
}

179

180

181

#MajorFunction#This is the function that does everything else

181

#MajorFunction#This is the function that does everything else

182

THEFT <- function(){

182

THEFT <- function(){

183

#Set working directory based on the directory of the series matrix file Currently only works for windows

183

#Set working directory based on the directory of the series matrix file Currently only works for windows

184

wd <- getwd()

184

wd <- getwd()

185

#list.files()

185

#list.files()

186

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

186

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

187

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

187

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

188

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

188

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

189

GSEfloc <- list.files()[GSEfileloc]

189

GSEfloc <- list.files()[GSEfileloc]

190

#ALL DATA FILES WILL BE CLEANED

190

#ALL DATA FILES WILL BE CLEANED

191

if(numDAT == 1){

191

if(numDAT == 1){

192

#indexing the data files

192

#indexing the data files

193

n <- 1

193

n <- 1

194

for(n in 1: length(GSEfloc)){

194

for(n in 1: length(GSEfloc)){

195

alz <- GSEfloc[n]

195

alz <- GSEfloc[n]

196

197

#Working with the wordy part of the document

197

#Working with the wordy part of the document

198

alzword <- alz %>%

198

alzword <- alz %>%

199

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

199

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

200

filter(grepl("!Sample",X1))%>%

200

filter(grepl("!Sample",X1))%>%

201

filter(!grepl("!Sample_contact",X1))

201

filter(!grepl("!Sample_contact",X1))

202

203

#Getting the GPL file

203

#Getting the GPL file

204

genena <- grep("_platform_id",alzword$X1) %>%

204

genena <- grep("_platform_id",alzword$X1) %>%

205

alzword$X2[.] %>%

205

alzword$X2[.] %>%

206

str_trim(.) %>%

206

str_trim(.) %>%

207

paste0("^",.,"\\D") %>%

207

paste0("^",.,"\\D") %>%

208

grep(.,list.files()) %>%

208

grep(.,list.files()) %>%

209

list.files()[.]

209

list.files()[.]

210

211

#Find out if it is a soft GPL file or not

211

#Find out if it is a soft GPL file or not

212

soft <- strsplit(genena,"[\\|/]") %>%

212

soft <- strsplit(genena,"[\\|/]") %>%

213

.[[1]] %>%

213

.[[1]] %>%

214

.[length(.)] %>%

214

.[length(.)] %>%

215

grepl("soft",.)

215

grepl("soft",.)

216

217

##Changing row names and column names:

217

##Changing row names and column names:

218

ALZWORD <- t(alzword)

218

ALZWORD <- t(alzword)

219

rownames(ALZWORD)=NULL

219

rownames(ALZWORD)=NULL

220

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

220

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

221

ALZWORD <- chngrownm(ALZWORD)[-1,]

221

ALZWORD <- chngrownm(ALZWORD)[-1,]

222

ALZWORD <- ALZWORD%>%

222

ALZWORD <- ALZWORD%>%

223

as.data.frame()%>%

223

as.data.frame()%>%

224

dplyr::select(-starts_with("col"))

224

dplyr::select(-starts_with("col"))

225

226

##Reorganizing information within the columns and final clinical data

226

##Reorganizing information within the columns and final clinical data

227

ALZWORDF <- cinfo(ALZWORD)

227

ALZWORDF <- cinfo(ALZWORD)

228

229

230

#Working with Actual Data part of file

230

#Working with Actual Data part of file

231

alzdat <- alz %>%

231

alzdat <- alz %>%

232

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

232

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

233

ALZDAT <- t(alzdat[,-1])

233

ALZDAT <- t(alzdat[,-1])

234

rownames(ALZDAT)=NULL

234

rownames(ALZDAT)=NULL

235

236

##Is there a clean version of the GPL file available?

236

##Is there a clean version of the GPL file available?

237

gplnum <- strsplit(genena,"[\\|/]") %>%

237

gplnum <- strsplit(genena,"[\\|/]") %>%

238

.[[1]] %>%

238

.[[1]] %>%

239

.[length(.)] %>%

239

.[length(.)] %>%

240

gsub("\\D","",.)

240

gsub("\\D","",.)

241

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

241

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

242

if(clfileex >= 1){

242

if(clfileex >= 1){

243

#use the clean version

243

#use the clean version

244

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

244

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

245

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

245

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

246

247

}

247

} else if(clfileex == 0){

248

else if(clfileex == 0){

249

##Lets Create a clean version

248

##Lets Create a clean version

250

249

251

##Gene ID to Gene Name

250

##Gene ID to Gene Name

252

if(soft == TRUE){

251

if(soft == TRUE){

253

#Check to see if there is already a file containing information on soft files

252

#Check to see if there is already a file containing information on soft files

254

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

253

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

255

if(fileex == 1){

254

if(fileex == 1){

256

#Check to see if this GPL soft file has been used before

255

#Check to see if this GPL soft file has been used before

257

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

256

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

258

.$GPL_FILE_NUM%>%

257

.$GPL_FILE_NUM%>%

259

grepl(gplnum,.) %>%

258

grepl(gplnum,.) %>%

260

sum()

259

sum()

261

if(IDF == 1){

260

if(IDF == 1){

262

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

261

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

263

.$GPL_FILE_NUM%>%

262

.$GPL_FILE_NUM%>%

264

grep(gplnum,.)

263

grep(gplnum,.)

265

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

264

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

266

.$LOC_ID %>%

265

.$LOC_ID %>%

267

.[IDLOCAL]

266

.[IDLOCAL]

268

geneIDNam <- genena %>%

267

geneIDNam <- genena %>%

269

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

268

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

270

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

269

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

271

} else if(IDF == 0){

270

} else if(IDF == 0){

272

#No information on this particular GPL file

271

#No information on this particular GPL file

273

idLOCGPL <- genena %>%

272

idLOCGPL <- genena %>%

274

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

273

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

275

t(.) %>%

274

t(.) %>%

276

grep("^ID\\s*$",.) %>%

275

grep("^ID\\s*$",.) %>%

277

-1

276

-1

278

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

277

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

279

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

278

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

280

geneIDNam <- genena %>%

279

geneIDNam <- genena %>%

281

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

280

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

282

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

281

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

283

}

282

}

284

} else if(fileex == 0){

283

} else if(fileex == 0){

285

#We must create a file that we can access for later use

284

#We must create a file that we can access for later use

286

idLOCGPL <- genena %>%

285

idLOCGPL <- genena %>%

287

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

286

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

288

t(.) %>%

287

t(.) %>%

289

grep("^ID\\s*$",.) %>%

288

grep("^ID\\s*$",.) %>%

290

-1

289

-1

291

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

290

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

292

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

291

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

293

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

292

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

294

geneIDNam <- genena %>%

293

geneIDNam <- genena %>%

295

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

294

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

296

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

295

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

297

}

296

}

298

} else if(soft == FALSE){

297

} else if(soft == FALSE){

299

geneIDNam <- genena %>%

298

geneIDNam <- genena %>%

300

read_delim(delim="\t",comment = "#")%>%

299

read_delim(delim="\t",comment = "#")%>%

301

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

300

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

302

}

301

}

303

302

304

##Labeling the gene IDs without names

303

##Labeling the gene IDs without names

305

geneIDNam <- NAFIXING(geneIDNam)

304

geneIDNam <- NAFIXING(geneIDNam)

306

305

307

##remove the whitespace

306

##remove the whitespace

308

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

307

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

309

308

310

##Here is the clean version

309

##Here is the clean version

311

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

310

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

312

}

311

}

313

312

314

313

315

314

316

##Changing the gene ID to gene name

315

##Changing the gene ID to gene name

317

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

316

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

318

colnames(ALZDAT) = ALZDAT1[1,]

317

colnames(ALZDAT) = ALZDAT1[1,]

319

318

320

319

321

##Adjusting the column names aka the gene names

320

##Adjusting the column names aka the gene names

322

colnames(ALZDAT) <- gcnames(ALZDAT)

321

colnames(ALZDAT) <- gcnames(ALZDAT)

323

322

324

323

325

#Full RAW Data

324

#Full RAW Data

326

Fullalzdwr <- ALZDAT %>%

325

Fullalzdwr <- ALZDAT %>%

327

as.data.frame() %>%

326

as.data.frame() %>%

328

cbind(ALZWORDF,.)

327

cbind(ALZWORDF,.)

329

328

330

#Raw file is output

329

#Raw file is output

331

nfnaex <- strsplit(alz,"[\\]") %>%

330

nfnaex <- strsplit(alz,"[\\]") %>%

332

.[[1]] %>%

331

.[[1]] %>%

333

.[length(.)] %>%

332

.[length(.)] %>%

334

gsub("\\D","",.) %>%

333

gsub("\\D","",.) %>%

335

c("GSE",.,"aftexcel.txt") %>%

334

c("GSE",.,"aftexcel.txt") %>%

336

paste(collapse = "")

335

paste(collapse = "")

337

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

336

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

338

337

339

338

340

339

341

#Now for the discretization part

340

#Now for the discretization part

342

##get the wordy part again

341

##get the wordy part again

343

rawword <- t(ALZWORDF)

342

rawword <- t(ALZWORDF)

344

343

345

##where is ID_REF located

344

##where is ID_REF located

346

hereim <- grep("ID_REF",rownames(rawword))

345

hereim <- grep("ID_REF",rownames(rawword))

347

346

348

##Subject Names GSM...

347

##Subject Names GSM...

349

subjnam <- rawword[hereim,]

348

subjnam <- rawword[hereim,]

350

349

351

##Getting the names for the rows

350

##Getting the names for the rows

352

namedarows <- rownames(rawword)[-hereim] %>%

351

namedarows <- rownames(rawword)[-hereim] %>%

353

as.data.frame()

352

as.data.frame()

354

RAWWORD <- rawword[-hereim,] %>%

353

RAWWORD <- rawword[-hereim,] %>%

355

as.data.frame() %>%

354

as.data.frame() %>%

356

bind_cols(namedarows,.)

355

bind_cols(namedarows,.)

357

z <- 1

356

z <- 1

358

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

357

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

359

for(z in 1:dim(RAWWORD)[1]){

358

for(z in 1:dim(RAWWORD)[1]){

360

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

359

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

361

z <- z + 1

360

z <- z + 1

362

}

361

}

363

362

364

colnames(naroww) <- "ROW_NAs"

363

colnames(naroww) <- "ROW_NAs"

365

RAWWORD <- bind_cols(RAWWORD,naroww)

364

RAWWORD <- bind_cols(RAWWORD,naroww)

366

365

367

366

368

roALZna <- t(ALZDAT) %>%

367

roALZna <- t(ALZDAT) %>%

369

rownames(.) %>%

368

rownames(.) %>%

370

as.data.frame(.)

369

as.data.frame(.)

371

colnames(roALZna) <- "ID_REF"

370

colnames(roALZna) <- "ID_REF"

372

371

373

RAWDAT <- t(ALZDAT) %>%

372

RAWDAT <- t(ALZDAT) %>%

374

as.data.frame(.)

373

as.data.frame(.)

375

colnames(RAWDAT) <- NULL

374

colnames(RAWDAT) <- NULL

376

rownames(RAWDAT) <- NULL

375

rownames(RAWDAT) <- NULL

377

376

378

RAWDAT2 <- RAWDAT %>%

377

RAWDAT2 <- RAWDAT %>%

379

cbind(roALZna,.) %>%

378

cbind(roALZna,.) %>%

380

dplyr::arrange(.,ID_REF)

379

dplyr::arrange(.,ID_REF)

381

380

382

##Editing the file for R processing

381

##Editing the file for R processing

383

RAWDATID <- RAWDAT2[,1] %>%

382

RAWDATID <- RAWDAT2[,1] %>%

384

as.matrix(.)

383

as.matrix(.)

385

384

386

RAWDATNUM <- RAWDAT2[,-1] %>%

385

RAWDATNUM <- RAWDAT2[,-1] %>%

387

mapply(.,FUN = as.numeric) %>%

386

mapply(.,FUN = as.numeric) %>%

388

t(.)

387

t(.)

389

388

390

##Consolidating genes with the same name

389

##Consolidating genes with the same name

391

###create empty matrix of size equal to tabRDATID

390

###create empty matrix of size equal to tabRDATID

392

tabRDATID <- table(RAWDATID)

391

tabRDATID <- table(RAWDATID)

393

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

392

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

394

j <- 1

393

j <- 1

395

for(j in 1:length(tabRDATID)){

394

for(j in 1:length(tabRDATID)){

396

##Putting the ones without duplicates in their new homes

395

##Putting the ones without duplicates in their new homes

397

if(tabRDATID[j] == 1){

396

if(tabRDATID[j] == 1){

398

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

397

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

399

} else if(tabRDATID[j] > 1){

398

} else if(tabRDATID[j] > 1){

400

##Averaging duplicates and putting them in their new homes

399

##Averaging duplicates and putting them in their new homes

401

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

400

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

402

}

401

}

403

j <- j + 1

402

j <- j + 1

404

}

403

}

405

404

406

##Scaling the Data

405

##Scaling the Data

407

scrawdat <- NuRDATN%>%

406

scrawdat <- NuRDATN%>%

408

scale()

407

scale()

409

attr(scrawdat,"scaled:center") <- NULL

408

attr(scrawdat,"scaled:center") <- NULL

410

attr(scrawdat,"scaled:scale") <- NULL

409

attr(scrawdat,"scaled:scale") <- NULL

411

colnames(scrawdat) <- rownames(tabRDATID)

410

colnames(scrawdat) <- rownames(tabRDATID)

412

411

413

##Discretized the Data

412

##Discretized the Data

414

dialzdat <- scrawdat %>%

413

dialzdat <- scrawdat %>%

415

dndat(.) %>%

414

dndat(.) %>%

416

t()%>%

415

t()%>%

417

as.data.frame(.)

416

as.data.frame(.)

418

colnames(dialzdat) <- rownames(RAWDATNUM)

417

colnames(dialzdat) <- rownames(RAWDATNUM)

419

418

420

##setting "ID_REF" as a new variable

419

##setting "ID_REF" as a new variable

421

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

420

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

422

colnames(geneNAM) <- "ID_REF"

421

colnames(geneNAM) <- "ID_REF"

423

rownames(dialzdat) <- NULL

422

rownames(dialzdat) <- NULL

424

dialzdat <-bind_cols(geneNAM,dialzdat)

423

dialzdat <-bind_cols(geneNAM,dialzdat)

425

424

426

##NAs in a column

425

##NAs in a column

427

x <- 2

426

x <- 2

428

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

427

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

429

nacol[1,1] = "COL_NAs"

428

nacol[1,1] = "COL_NAs"

430

for(x in 2:dim(dialzdat)[2]){

429

for(x in 2:dim(dialzdat)[2]){

431

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

430

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

432

x <- x + 1

431

x <- x + 1

433

}

432

}

434

colnames(nacol) <- colnames(dialzdat)

433

colnames(nacol) <- colnames(dialzdat)

435

dialzdat <- bind_rows(dialzdat,nacol)

434

dialzdat <- bind_rows(dialzdat,nacol)

436

435

437

##NAs in a row

436

##NAs in a row

438

y <- 1

437

y <- 1

439

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

438

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

440

for(y in 1:dim(dialzdat)[1]){

439

for(y in 1:dim(dialzdat)[1]){

441

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

440

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

442

y <- y + 1

441

y <- y + 1

443

}

442

}

444

colnames(narowd) <- "ROW_NAs"

443

colnames(narowd) <- "ROW_NAs"

445

dialzdat <- bind_cols(dialzdat,narowd)

444

dialzdat <- bind_cols(dialzdat,narowd)

446

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

445

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

447

colnames(RAWWORD) <- colnames(dialzdat)

446

colnames(RAWWORD) <- colnames(dialzdat)

448

##converting to character so that the clinical can be brought together with discrete data

447

##converting to character so that the clinical can be brought together with discrete data

449

k <- 2

448

k <- 2

450

for(k in 2:dim(dialzdat)[2]-1){

449

for(k in 2:dim(dialzdat)[2]-1){

451

dialzdat[,k] <- as.character(dialzdat[,k])

450

dialzdat[,k] <- as.character(dialzdat[,k])

452

k <- k + 1

451

k <- k + 1

453

}

452

}

454

#The End the full data

453

#The End the full data

455

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

454

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

456

455

457

#Produces Discrete file

456

#Produces Discrete file

458

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

457

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

459

.[[1]] %>%

458

.[[1]] %>%

460

.[length(.)] %>%

459

.[length(.)] %>%

461

gsub("\\D","",.) %>%

460

gsub("\\D","",.) %>%

462

c("GSE",.,"dscrt.txt") %>%

461

c("GSE",.,"dscrt.txt") %>%

463

paste(collapse = "")

462

paste(collapse = "")

464

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

463

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

465

n <- n +1

464

n <- n +1

466

}

465

}

467

} else if(numDAT == 2){

466

} else if(numDAT == 2){

468

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

467

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

469

468

470

#All the files you want to analyze

469

#All the files you want to analyze

471

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

470

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

472

if(length(ANDIS) == 0){

471

if(length(ANDIS) == 0){

473

#Spit out a warning

472

#Spit out a warning

474

warning("You did not select any files and so no cleaning will be performed")

473

warning("You did not select any files and so no cleaning will be performed")

475

} else{

474

} else{

476

#indexing the data files

475

#indexing the data files

477

n <- 1

476

n <- 1

478

for(n in 1: length(ANDIS)){

477

for(n in 1: length(ANDIS)){

479

alz <- ANDIS[n]

478

alz <- ANDIS[n]

480

479

481

#Working with the wordy part of the document

480

#Working with the wordy part of the document

482

alzword <- alz %>%

481

alzword <- alz %>%

483

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

482

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

484

filter(grepl("!Sample",X1))%>%

483

filter(grepl("!Sample",X1))%>%

485

filter(!grepl("!Sample_contact",X1))

484

filter(!grepl("!Sample_contact",X1))

486

485

487

#Getting the GPL file

486

#Getting the GPL file

488

genena <- grep("_platform_id",alzword$X1) %>%

487

genena <- grep("_platform_id",alzword$X1) %>%

489

alzword$X2[.] %>%

488

alzword$X2[.] %>%

490

str_trim(.) %>%

489

str_trim(.) %>%

491

paste0("^",.,"\\D") %>%

490

paste0("^",.,"\\D") %>%

492

grep(.,list.files()) %>%

491

grep(.,list.files()) %>%

493

list.files()[.]

492

list.files()[.]

494

493

495

#Find out if it is a soft GPL file or not

494

#Find out if it is a soft GPL file or not

496

soft <- strsplit(genena,"[\\|/]") %>%

495

soft <- strsplit(genena,"[\\|/]") %>%

497

.[[1]] %>%

496

.[[1]] %>%

498

.[length(.)] %>%

497

.[length(.)] %>%

499

grepl("soft",.)

498

grepl("soft",.)

500

499

501

##Changing row names and column names:

500

##Changing row names and column names:

502

ALZWORD <- t(alzword)

501

ALZWORD <- t(alzword)

503

rownames(ALZWORD)=NULL

502

rownames(ALZWORD)=NULL

504

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

503

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

505

ALZWORD <- chngrownm(ALZWORD)[-1,]

504

ALZWORD <- chngrownm(ALZWORD)[-1,]

506

ALZWORD <- ALZWORD%>%

505

ALZWORD <- ALZWORD%>%

507

as.data.frame()%>%

506

as.data.frame()%>%

508

dplyr::select(-starts_with("col"))

507

dplyr::select(-starts_with("col"))

509

508

510

##Reorganizing information within the columns and final clinical data

509

##Reorganizing information within the columns and final clinical data

511

ALZWORDF <- cinfo(ALZWORD)

510

ALZWORDF <- cinfo(ALZWORD)

512

511

513

512

514

#Working with Actual Data part of file

513

#Working with Actual Data part of file

515

alzdat <- alz %>%

514

alzdat <- alz %>%

516

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

515

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

517

ALZDAT <- t(alzdat[,-1])

516

ALZDAT <- t(alzdat[,-1])

518

rownames(ALZDAT)=NULL

517

rownames(ALZDAT)=NULL

519

518

520

##Is there a clean version of the GPL file available?

519

##Is there a clean version of the GPL file available?

521

gplnum <- strsplit(genena,"[\\|/]") %>%

520

gplnum <- strsplit(genena,"[\\|/]") %>%

522

.[[1]] %>%

521

.[[1]] %>%

523

.[length(.)] %>%

522

.[length(.)] %>%

524

gsub("\\D","",.)

523

gsub("\\D","",.)

525

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

524

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

526

if(clfileex >= 1){

525

if(clfileex >= 1){

527

#use the clean version

526

#use the clean version

528

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

527

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

529

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

528

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

530

529

531

} else if(clfileex == 0){

530

} else if(clfileex == 0){

532

##Lets Create a clean version

531

##Lets Create a clean version

533

532

534

##Gene ID to Gene Name

533

##Gene ID to Gene Name

535

if(soft == TRUE){

534

if(soft == TRUE){

536

#Check to see if there is already a file containing information on soft files

535

#Check to see if there is already a file containing information on soft files

537

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

536

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

538

if(fileex == 1){

537

if(fileex == 1){

539

#Check to see if this GPL soft file has been used before

538

#Check to see if this GPL soft file has been used before

540

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

539

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

541

.$GPL_FILE_NUM%>%

540

.$GPL_FILE_NUM%>%

542

grepl(gplnum,.) %>%

541

grepl(gplnum,.) %>%

543

sum()

542

sum()

544

if(IDF == 1){

543

if(IDF == 1){

545

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

544

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

546

.$GPL_FILE_NUM%>%

545

.$GPL_FILE_NUM%>%

547

grep(gplnum,.)

546

grep(gplnum,.)

548

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

547

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

549

.$LOC_ID %>%

548

.$LOC_ID %>%

550

.[IDLOCAL]

549

.[IDLOCAL]

551

geneIDNam <- genena %>%

550

geneIDNam <- genena %>%

552

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

551

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

553

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

552

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

554

} else if(IDF == 0){

553

} else if(IDF == 0){

555

#No information on this particular GPL file

554

#No information on this particular GPL file

556

idLOCGPL <- genena %>%

555

idLOCGPL <- genena %>%

557

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

556

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

558

t(.) %>%

557

t(.) %>%

559

grep("^ID\\s*$",.) %>%

558

grep("^ID\\s*$",.) %>%

560

-1

559

-1

561

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

560

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

562

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

561

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

563

geneIDNam <- genena %>%

562

geneIDNam <- genena %>%

564

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

563

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

565

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

564

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

566

}

565

}

567

} else if(fileex == 0){

566

} else if(fileex == 0){

568

#We must create a file that we can access for later use

567

#We must create a file that we can access for later use

569

idLOCGPL <- genena %>%

568

idLOCGPL <- genena %>%

570

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

569

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

571

t(.) %>%

570

t(.) %>%

572

grep("^ID\\s*$",.) %>%

571

grep("^ID\\s*$",.) %>%

573

-1

572

-1

574

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

573

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

575

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

574

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

576

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

575

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

577

geneIDNam <- genena %>%

576

geneIDNam <- genena %>%

578

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

577

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

579

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

578

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

580

}

579

}

581

} else if(soft == FALSE){

580

} else if(soft == FALSE){

582

geneIDNam <- genena %>%

581

geneIDNam <- genena %>%

583

read_delim(delim="\t",comment = "#")%>%

582

read_delim(delim="\t",comment = "#")%>%

584

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

583

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

585

}

584

}

586

585

587

##Labeling the gene IDs without names

586

##Labeling the gene IDs without names

588

geneIDNam <- NAFIXING(geneIDNam)

587

geneIDNam <- NAFIXING(geneIDNam)

589

588

590

##remove the whitespace

589

##remove the whitespace

591

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

590

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

592

591

593

##Here is the clean version

592

##Here is the clean version

594

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

593

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

595

}

594

}

596

595

597

596

598

597

599

##Changing the gene ID to gene name

598

##Changing the gene ID to gene name

600

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

599

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

601

colnames(ALZDAT) = ALZDAT1[1,]

600

colnames(ALZDAT) = ALZDAT1[1,]

602

601

603

602

604

##Adjusting the column names aka the gene names

603

##Adjusting the column names aka the gene names

605

colnames(ALZDAT) <- gcnames(ALZDAT)

604

colnames(ALZDAT) <- gcnames(ALZDAT)

606

605

607

606

608

#Full RAW Data

607

#Full RAW Data

609

Fullalzdwr <- ALZDAT %>%

608

Fullalzdwr <- ALZDAT %>%

610

as.data.frame() %>%

609

as.data.frame() %>%

611

cbind(ALZWORDF,.)

610

cbind(ALZWORDF,.)

612

611

613

#Raw file is output

612

#Raw file is output

614

nfnaex <- strsplit(alz,"[\\]") %>%

613

nfnaex <- strsplit(alz,"[\\]") %>%

615

.[[1]] %>%

614

.[[1]] %>%

616

.[length(.)] %>%

615

.[length(.)] %>%

617

gsub("\\D","",.) %>%

616

gsub("\\D","",.) %>%

618

c("GSE",.,"aftexcel.txt") %>%

617

c("GSE",.,"aftexcel.txt") %>%

619

paste(collapse = "")

618

paste(collapse = "")

620

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

619

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

621

620

622

621

623

622

624

#Now for the discretization part

623

#Now for the discretization part

625

##get the wordy part again

624

##get the wordy part again

626

rawword <- t(ALZWORDF)

625

rawword <- t(ALZWORDF)

627

626

628

##where is ID_REF located

627

##where is ID_REF located

629

hereim <- grep("ID_REF",rownames(rawword))

628

hereim <- grep("ID_REF",rownames(rawword))

630

629

631

##Subject Names GSM...

630

##Subject Names GSM...

632

subjnam <- rawword[hereim,]

631

subjnam <- rawword[hereim,]

633

632

634

##Getting the names for the rows

633

##Getting the names for the rows

635

namedarows <- rownames(rawword)[-hereim] %>%

634

namedarows <- rownames(rawword)[-hereim] %>%

636

as.data.frame()

635

as.data.frame()

637

RAWWORD <- rawword[-hereim,] %>%

636

RAWWORD <- rawword[-hereim,] %>%

638

as.data.frame() %>%

637

as.data.frame() %>%

639

bind_cols(namedarows,.)

638

bind_cols(namedarows,.)

640

z <- 1

639

z <- 1

641

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

640

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

642

for(z in 1:dim(RAWWORD)[1]){

641

for(z in 1:dim(RAWWORD)[1]){

643

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

642

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

644

z <- z + 1

643

z <- z + 1

645

}

644

}

646

645

647

colnames(naroww) <- "ROW_NAs"

646

colnames(naroww) <- "ROW_NAs"

648

RAWWORD <- bind_cols(RAWWORD,naroww)

647

RAWWORD <- bind_cols(RAWWORD,naroww)

649

648

650

649

651

roALZna <- t(ALZDAT) %>%

650

roALZna <- t(ALZDAT) %>%

652

rownames(.) %>%

651

rownames(.) %>%

653

as.data.frame(.)

652

as.data.frame(.)

654

colnames(roALZna) <- "ID_REF"

653

colnames(roALZna) <- "ID_REF"

655

654

656

RAWDAT <- t(ALZDAT) %>%

655

RAWDAT <- t(ALZDAT) %>%

657

as.data.frame(.)

656

as.data.frame(.)

658

colnames(RAWDAT) <- NULL

657

colnames(RAWDAT) <- NULL

659

rownames(RAWDAT) <- NULL

658

rownames(RAWDAT) <- NULL

660

659

661

RAWDAT2 <- RAWDAT %>%

660

RAWDAT2 <- RAWDAT %>%

662

cbind(roALZna,.) %>%

661

cbind(roALZna,.) %>%

663

dplyr::arrange(.,ID_REF)

662

dplyr::arrange(.,ID_REF)

664

663

665

##Editing the file for R processing

664

##Editing the file for R processing

666

RAWDATID <- RAWDAT2[,1] %>%

665

RAWDATID <- RAWDAT2[,1] %>%

667

as.matrix(.)

666

as.matrix(.)

668

667

669

RAWDATNUM <- RAWDAT2[,-1] %>%

668

RAWDATNUM <- RAWDAT2[,-1] %>%

670

mapply(.,FUN = as.numeric) %>%

669

mapply(.,FUN = as.numeric) %>%

671

t(.)

670

t(.)

672

671

673

##Consolidating genes with the same name

672

##Consolidating genes with the same name

674

###create empty matrix of size equal to tabRDATID

673

###create empty matrix of size equal to tabRDATID

675

tabRDATID <- table(RAWDATID)

674

tabRDATID <- table(RAWDATID)

676

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

675

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

677

j <- 1

676

j <- 1

678

for(j in 1:length(tabRDATID)){

677

for(j in 1:length(tabRDATID)){

679

##Putting the ones without duplicates in their new homes

678

##Putting the ones without duplicates in their new homes

680

if(tabRDATID[j] == 1){

679

if(tabRDATID[j] == 1){

681

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

680

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

682

} else if(tabRDATID[j] > 1){

681

} else if(tabRDATID[j] > 1){

683

##Averaging duplicates and putting them in their new homes

682

##Averaging duplicates and putting them in their new homes

684

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

683

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

685

}

684

}

686

j <- j + 1

685

j <- j + 1

687

}

686

}

688

687

689

##Scaling the Data

688

##Scaling the Data

690

scrawdat <- NuRDATN%>%

689

scrawdat <- NuRDATN%>%

691

scale()

690

scale()

692

attr(scrawdat,"scaled:center") <- NULL

691

attr(scrawdat,"scaled:center") <- NULL

693

attr(scrawdat,"scaled:scale") <- NULL

692

attr(scrawdat,"scaled:scale") <- NULL

694

colnames(scrawdat) <- rownames(tabRDATID)

693

colnames(scrawdat) <- rownames(tabRDATID)

695

694

696

##Discretized the Data

695

##Discretized the Data

697

dialzdat <- scrawdat %>%

696

dialzdat <- scrawdat %>%

698

dndat(.) %>%

697

dndat(.) %>%

699

t()%>%

698

t()%>%

700

as.data.frame(.)

699

as.data.frame(.)

701

colnames(dialzdat) <- rownames(RAWDATNUM)

700

colnames(dialzdat) <- rownames(RAWDATNUM)

702

701

703

##setting "ID_REF" as a new variable

702

##setting "ID_REF" as a new variable

704

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

703

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

705

colnames(geneNAM) <- "ID_REF"

704

colnames(geneNAM) <- "ID_REF"

706

rownames(dialzdat) <- NULL

705

rownames(dialzdat) <- NULL

707

dialzdat <-bind_cols(geneNAM,dialzdat)

706

dialzdat <-bind_cols(geneNAM,dialzdat)

708

707

709

##NAs in a column

708

##NAs in a column

710

x <- 2

709

x <- 2

711

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

710

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

712

nacol[1,1] = "COL_NAs"

711

nacol[1,1] = "COL_NAs"

713

for(x in 2:dim(dialzdat)[2]){

712

for(x in 2:dim(dialzdat)[2]){

714

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

713

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

715

x <- x + 1

714

x <- x + 1

716

}

715

}

717

colnames(nacol) <- colnames(dialzdat)

716

colnames(nacol) <- colnames(dialzdat)

718

dialzdat <- bind_rows(dialzdat,nacol)

717

dialzdat <- bind_rows(dialzdat,nacol)

719

718

720

##NAs in a row

719

##NAs in a row

721

y <- 1

720

y <- 1

722

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

721

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

723

for(y in 1:dim(dialzdat)[1]){

722

for(y in 1:dim(dialzdat)[1]){

724

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

723

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

725

y <- y + 1

724

y <- y + 1

726

}

725

}

727

colnames(narowd) <- "ROW_NAs"

726

colnames(narowd) <- "ROW_NAs"

728

dialzdat <- bind_cols(dialzdat,narowd)

727

dialzdat <- bind_cols(dialzdat,narowd)

729

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

728

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

730

colnames(RAWWORD) <- colnames(dialzdat)

729

colnames(RAWWORD) <- colnames(dialzdat)

731

##converting to character so that the clinical can be brought together with discrete data

730

##converting to character so that the clinical can be brought together with discrete data

732

k <- 2

731

k <- 2

733

for(k in 2:dim(dialzdat)[2]-1){

732

for(k in 2:dim(dialzdat)[2]-1){

734

dialzdat[,k] <- as.character(dialzdat[,k])

733

dialzdat[,k] <- as.character(dialzdat[,k])

735

k <- k + 1

734

k <- k + 1

736

}

735

}

737

#The End the full data

736

#The End the full data

738

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

737

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

739

738

740

#Produces Discrete file

739

#Produces Discrete file

741

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

740

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

742

.[[1]] %>%

741

.[[1]] %>%

743

.[length(.)] %>%

742

.[length(.)] %>%

744

gsub("\\D","",.) %>%

743

gsub("\\D","",.) %>%

745

c("GSE",.,"dscrt.txt") %>%

744

c("GSE",.,"dscrt.txt") %>%

746

paste(collapse = "")

745

paste(collapse = "")

747

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

746

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

748

747

749

748

750

n <- n + 1

749

n <- n + 1

751

}

750

}

752

}

751

}

753

}

752

}

754

}

753

}

755

#The Rest of this code will be used every time you want to change a data set

754

#The Rest of this code will be used every time you want to change a data set

756

THEFT()

755

THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Don't use this code yet

 ########################################################################
 #               Don't Use This Code Just Yet                           #
 ########################################################################
 #Efrain H. Gonzalez
 #6/16/2017
 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 #Renames the columns of a transposed sample-annotation matrix.
 #  mat: character matrix that already has column names. Row 1 holds the
 #       field labels (e.g. "!Sample_title") and row 2 holds the values of
 #       the first sample, which are scanned for keywords.
 #Returns mat with the recognised columns renamed; every other column keeps
 #its current name. Only the column names change, never the cell contents.
 chngrownm <- function(mat){
 	#Columns recognised directly from their field label in row 1
 	fixedlab <- c("!Sample_source_name_ch1" = "Brain_Region",
 		"!Sample_title" = "Title",
 		"!Sample_geo_accession" = "ID_REF")
 	#Columns recognised from a keyword in row 2. The rules are tried in this
 	#order and are NOT exclusive: when several match, the last match names the
 	#column, but the counter of every matching rule is still advanced.
 	keyrules <- c(Sex = "Sex|gender|Gender|sex",
 		PMI = "postmorteminterval|PMI|pmi|interval",
 		Age = "age|Age|AGE",
 		Braak = "braak|b&b",
 		Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal")
 	counters <- rep(1,length(keyrules))
 	names(counters) <- names(keyrules)
 	#seq_len() keeps a zero-column matrix from being indexed at column 1 and 0
 	for(e in seq_len(dim(mat)[2])){
 		#match() returns NA (instead of failing) for an NA or unknown label
 		fixedhit <- match(mat[1,e],names(fixedlab))
 		if(!is.na(fixedhit)){
 			colnames(mat)[e] <- fixedlab[[fixedhit]]
 		} else{
 			for(rule in names(keyrules)){
 				if(grepl(keyrules[[rule]],mat[2,e])){
 					colnames(mat)[e] <- paste0(rule,counters[[rule]])
 					counters[[rule]] <- counters[[rule]] + 1
 				}
 			}
 		}
 	}
 	mat
 }
 #2#Function for reorganizing information within the columns
 #Cleans the clinical columns according to the name each column carries.
 #  mat: data frame (or matrix) whose column names contain "Group", "Age",
 #       "Sex", "PMI" or "Braak" for the columns to be cleaned.
 #The first column is never modified. For the remaining columns:
 #  Group - text up to the last ": " is removed, as is any " ...;..." tail
 #  Age   - every non-digit is removed, result converted to integer
 #  Sex   - text up to the last ": " is removed
 #  PMI   - everything except digits and "." is removed, converted to numeric
 #  Braak - text up to the last ": " is removed, the rest is read as a roman
 #          numeral and converted to integer
 #Returns mat with the cleaned columns; other columns are left as they are.
 cinfo <- function(mat){
 	#seq_len(n)[-1] is 2..n and is empty for a single column; the former
 	#2:col counted down to 1 in that case and rewrote the first column
 	for(j in seq_len(dim(mat)[2])[-1]){
 		colnm <- colnames(mat)[j]
 		if(grepl("Group",colnm)){
 			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
 		} else if(grepl("Age",colnm)){
 			mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
 		} else if(grepl("Sex",colnm)){
 			mat[,j] <- gsub(".+:\\s","",mat[,j])
 		} else if(grepl("PMI",colnm)){
 			mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
 		} else if(grepl("Braak",colnm)){
 			mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
 		}
 	}
 	mat
 }
 #3#Function for labeling the gene IDs without names
 #Fills in missing gene names with the gene ID of the same row.
 #  GIDNAM: data frame, tibble or matrix whose first column holds the IDs and
 #          whose second column holds the names.
 #A name counts as missing when it is NA or is the text "NA" optionally
 #followed by whitespace.
 #Returns GIDNAM with the second column completed; a table without rows is
 #returned unchanged.
 NAFIXING <- function(GIDNAM){
 	ids <- GIDNAM[,1,drop = TRUE]
 	symbols <- GIDNAM[,2,drop = TRUE]
 	unnamed <- is.na(symbols) | grepl("^NA\\s*$",symbols)
 	if(any(unnamed)){
 		#keep the name column's type when the IDs were read as numbers
 		if(is.character(symbols)){
 			ids <- as.character(ids)
 		}
 		GIDNAM[unnamed,2] <- ids[unnamed]
 	}
 	GIDNAM
 }
 #4#Function for changing the gene ID to gene name
 #Replaces the gene IDs in the first row of DATA by their gene names.
 #  GeneName: matrix with the gene IDs in row 1 and the gene names in row 2
 #  DATA:     matrix whose first row holds the gene IDs to translate
 #Returns DATA in its original orientation with row 1 translated; IDs that
 #are not found in GeneName are left as they are.
 #NOTE(review): the previous body transposed both inputs but still indexed
 #them with [1,] and [2,], so it only ever looked at the first gene and the
 #first probe and returned the transposed data. The caller reads the result
 #as ALZDAT1[1,], one entry per probe, which is the layout restored here -
 #please confirm this is the intended contract.
 cgeneID <- function(GeneName,DATA){
 	dataid <- trimws(DATA[1,])
 	#exact comparison: an ID such as "p.3" must not behave like a regex
 	hit <- match(dataid,trimws(GeneName[1,]))
 	found <- which(!is.na(hit) & !is.na(dataid))
 	if(length(found) > 0){
 		#when an ID is listed more than once match() keeps its first entry
 		DATA[1,found] <- GeneName[2,hit[found]]
 	}
 	DATA
 }
 #cgeneID <- function(GeneName,DATA){
 #    colGene <- dim(GeneName)[2]
 #     j <- 1
 #     for(j in 1:colGene){
 #	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
 #	if(is.na(sum(chngsreq))==FALSE){
 #		if(sum(chngsreq) > 0){
 #			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
 #		}
 #	}
 #	j = j+1
 #	}
 #	DATA
 #}
 #5#Function for adjusting the gene names
 #Picks one gene name for every column of DiData.
 #  DiData: matrix or data frame whose column names may list several gene
 #          names separated by "///"
 #  usecol: position of the name to keep (default 1). Columns that list
 #          fewer than usecol names fall back to their first name.
 #Returns a character vector with one whitespace-trimmed name per column,
 #character(0) when DiData has no columns.
 gcnames <- function(DiData,usecol=1){
 	if(dim(DiData)[2] == 0){
 		return(character(0))
 	}
 	#split every column name once
 	pieces <- strsplit(colnames(DiData),"///")
 	nwnam <- vapply(pieces,function(piece){
 		if(length(piece) >= usecol){
 			piece[usecol]
 		} else{
 			piece[1]
 		}
 	},character(1),USE.NAMES = FALSE)
 	str_trim(nwnam)
 }
 #6# Function for discretizing the data
 #Turns every value of a numeric table into one of three levels.
 #  NDATA: numeric matrix (or all-numeric data frame)
 #Returns a numeric matrix of the same size that keeps the column names of
 #NDATA (row names are not kept):
 #  0 for values below -1
 #  1 for values from -1 to 1, both ends included
 #  2 for values above 1
 #Missing values (NA/NaN) are copied through unchanged.
 dndat <- function(NDATA){
 	vals <- as.matrix(NDATA)
 	#start from the middle level so that a value of exactly 1 is level 1; it
 	#used to match no branch and silently stayed at the initial 0
 	DDATA <- matrix(1,nrow=dim(NDATA)[1],ncol=dim(NDATA)[2])
 	colnames(DDATA) <- colnames(NDATA)
 	#which() drops the NA comparisons so they never reach the assignment
 	DDATA[which(vals < -1)] <- 0
 	DDATA[which(vals > 1)] <- 2
 	gaps <- which(is.na(vals))
 	DDATA[gaps] <- vals[gaps]
 	DDATA
 }
 #Helper#Last component of a file name split on "\", "|" or "/"
 .theft_lastpart <- function(path){
 	parts <- strsplit(path,"[\\|/]")[[1]]
 	parts[length(parts)]
 }
 #Helper#Number of lines to skip in a soft GPL file to reach its "ID" header
 .theft_find_id_line <- function(genena){
 	firstrows <- read_delim(genena,delim="\t",col_names = FALSE, comment = "!", n_max = 1000)
 	idpos <- grep("^ID\\s*$",t(firstrows))
 	if(length(idpos) == 0){
 		stop("No ID header found in the first 1000 lines of ",genena,call. = FALSE)
 	}
 	idpos[1] - 1
 }
 #Helper#Skip value for a soft GPL file, remembered in GPL_ID_LOC.txt
 .theft_soft_offset <- function(genena,gplnum){
 	locfile <- "GPL_ID_LOC.txt"
 	if(file.exists(locfile)){
 		#Check to see if this GPL soft file has been used before
 		known <- read_delim(locfile,delim = "\t",col_names = TRUE)
 		#exact comparison: GPL96 must not be taken for GPL960
 		seen <- which(known$GPL_FILE_NUM == as.integer(gplnum))
 		if(length(seen) > 0){
 			return(known$LOC_ID[seen[1]])
 		}
 		#No information on this particular GPL file
 		idLOCGPL <- .theft_find_id_line(genena)
 		cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
 			cat(file=locfile,sep = "\t", fill = TRUE, append = TRUE)
 		return(idLOCGPL)
 	}
 	#We must create a file that we can access for later use
 	idLOCGPL <- .theft_find_id_line(genena)
 	Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
 	colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
 	write.table(Firstval,file = locfile, sep = "\t",row.names = FALSE, col.names = TRUE)
 	idLOCGPL
 }
 #Helper#Reads the ID column and the gene name column(s) of a GPL file
 .theft_read_gpl <- function(genena,comment,skip = 0){
 	read_delim(genena,delim="\t",col_names = TRUE, comment = comment, skip = skip) %>%
 		dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 }
 #Helper#Gene ID / gene name table, taken from Clean_GPL<digits>.txt when
 #that file exists and otherwise built from the GPL file and saved there
 .theft_gene_table <- function(genena,soft,gplnum){
 	cleanfile <- paste0("Clean_GPL",gplnum,".txt")
 	##Is there a clean version of the GPL file available?
 	if(file.exists(cleanfile)){
 		#use the clean version
 		return(read_delim(cleanfile,delim="\t",col_names = c("ID","Symbol"), comment = "!"))
 	}
 	##Lets Create a clean version
 	if(soft){
 		geneIDNam <- .theft_read_gpl(genena,comment = "!",skip = .theft_soft_offset(genena,gplnum))
 	} else{
 		geneIDNam <- .theft_read_gpl(genena,comment = "#")
 	}
 	##Labeling the gene IDs without names
 	geneIDNam <- NAFIXING(geneIDNam)
 	##remove the whitespace
 	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
 	##Here is the clean version
 	write.table(geneIDNam,file = cleanfile,sep = "\t",row.names = FALSE, col.names = FALSE)
 	geneIDNam
 }
 #Helper#Cleans one series matrix file: writes the raw table
 #GSE<digits>aftexcel.txt and the discretized table GSE<digits>dscrt.txt
 .theft_clean_file <- function(alz){
 	#Working with the wordy part of the document
 	alzword <- alz %>%
 		read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
 		filter(grepl("!Sample",X1))%>%
 		filter(!grepl("!Sample_contact",X1))
 	#Getting the GPL file
 	platform <- str_trim(alzword$X2[grep("_platform_id",alzword$X1)])
 	dirfiles <- list.files()
 	genena <- dirfiles[grep(paste0("^",platform[1],"\\D"),dirfiles)]
 	if(length(genena) == 0){
 		stop("No GPL file for platform ",platform[1]," was found for ",alz,call. = FALSE)
 	}
 	#several files can match; the first one in list.files() order is used
 	genena <- genena[1]
 	#Find out if it is a soft GPL file or not
 	soft <- grepl("soft",.theft_lastpart(genena))
 	##Changing row names and column names:
 	ALZWORD <- t(alzword)
 	rownames(ALZWORD) <- NULL
 	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 	ALZWORD <- chngrownm(ALZWORD)[-1,]
 	ALZWORD <- ALZWORD%>%
 		as.data.frame()%>%
 		dplyr::select(-starts_with("col"))
 	##Reorganizing information within the columns and final clinical data
 	ALZWORDF <- cinfo(ALZWORD)
 	#Working with Actual Data part of file
 	alzdat <- alz %>%
 		read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
 	ALZDAT <- t(alzdat[,-1])
 	rownames(ALZDAT) <- NULL
 	##Gene ID to Gene Name
 	gplnum <- gsub("\\D","",.theft_lastpart(genena))
 	geneIDNam <- .theft_gene_table(genena,soft,gplnum)
 	##Changing the gene ID to gene name
 	ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
 	colnames(ALZDAT) <- ALZDAT1[1,]
 	##Adjusting the column names aka the gene names
 	colnames(ALZDAT) <- gcnames(ALZDAT)
 	#Full RAW Data
 	Fullalzdwr <- cbind(ALZWORDF,as.data.frame(ALZDAT))
 	#Raw file is output
 	gsenum <- gsub("\\D","",.theft_lastpart(alz))
 	write.table(t(Fullalzdwr), file = paste0("GSE",gsenum,"aftexcel.txt"), sep = "\t")
 	#Now for the discretization part
 	##get the wordy part again
 	rawword <- t(ALZWORDF)
 	##where is ID_REF located
 	hereim <- grep("ID_REF",rownames(rawword))
 	if(length(hereim) == 0){
 		stop("No ID_REF (sample accession) row was found in ",alz,call. = FALSE)
 	}
 	##Subject Names GSM...
 	subjnam <- rawword[hereim,]
 	##Getting the names for the rows
 	namedarows <- data.frame(ID_REF = rownames(rawword)[-hereim],stringsAsFactors = FALSE)
 	#drop = FALSE keeps a single remaining clinical row as a row
 	RAWWORD <- bind_cols(namedarows,as.data.frame(rawword[-hereim,,drop = FALSE],stringsAsFactors = FALSE))
 	naroww <- data.frame(ROW_NAs = as.integer(rowSums(is.na(RAWWORD))))
 	RAWWORD <- bind_cols(RAWWORD,naroww)
 	##Editing the file for R processing (samples in rows, genes in columns)
 	RAWDATID <- colnames(ALZDAT)
 	RAWDATNUM <- matrix(as.numeric(ALZDAT),nrow = dim(ALZDAT)[1])
 	##Consolidating genes with the same name
 	tabRDATID <- table(RAWDATID)
 	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
 	for(j in seq_along(tabRDATID)){
 		samegene <- which(RAWDATID == names(tabRDATID)[j])
 		if(length(samegene) == 1){
 			##Putting the ones without duplicates in their new homes
 			NuRDATN[,j] <- RAWDATNUM[,samegene]
 		} else{
 			##Averaging duplicates and putting them in their new homes
 			NuRDATN[,j] <- rowMeans(RAWDATNUM[,samegene,drop = FALSE],na.rm = TRUE)
 		}
 	}
 	##Scaling the Data
 	scrawdat <- scale(NuRDATN)
 	attr(scrawdat,"scaled:center") <- NULL
 	attr(scrawdat,"scaled:scale") <- NULL
 	colnames(scrawdat) <- names(tabRDATID)
 	##Discretized the Data (genes in rows, samples in columns)
 	dialzdat <- as.data.frame(t(dndat(scrawdat)))
 	##setting "ID_REF" as a new variable
 	geneNAM <- data.frame(ID_REF = rownames(dialzdat),stringsAsFactors = FALSE)
 	rownames(dialzdat) <- NULL
 	dialzdat <- bind_cols(geneNAM,dialzdat)
 	##NAs in a column
 	nacol <- data.frame("COL_NAs",t(as.numeric(colSums(is.na(dialzdat[,-1,drop = FALSE])))),stringsAsFactors = FALSE)
 	colnames(nacol) <- colnames(dialzdat)
 	dialzdat <- bind_rows(dialzdat,nacol)
 	##NAs in a row
 	narowd <- data.frame(ROW_NAs = as.integer(rowSums(is.na(dialzdat))))
 	dialzdat <- bind_cols(dialzdat,narowd)
 	colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
 	colnames(RAWWORD) <- colnames(dialzdat)
 	##converting to character so that the clinical can be brought together with discrete data
 	#every column except the trailing ROW_NAs; the former 2:dim(dialzdat)[2]-1
 	#evaluated to exactly this range, 1..(n-1), because ":" binds before "-"
 	chrcols <- seq_len(dim(dialzdat)[2]-1)
 	dialzdat[chrcols] <- lapply(dialzdat[chrcols],as.character)
 	#The End the full data
 	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
 	#Produces Discrete file
 	write.table(Dscrtalzdw, file = paste0("GSE",gsenum,"dscrt.txt"), sep = "\t",col.names = TRUE,row.names = FALSE)
 	invisible(NULL)
 }
 #MajorFunction#This is the function that does everything else
 #Interactive entry point. Asks whether every series matrix file
 #(GSE*.txt.gz) in the working directory should be cleaned or only a
 #selection, then cleans the chosen files one after the other.
 #For each file it writes GSE<digits>aftexcel.txt (clinical + raw data) and
 #GSE<digits>dscrt.txt (clinical + discretized data); it also creates
 #Clean_GPL<digits>.txt and GPL_ID_LOC.txt when they are needed.
 #Takes no arguments and returns NULL invisibly.
 THEFT <- function(){
 	wd <- getwd()
 	numDAT <- menu(choices = c("Yes","No"),title = paste0("Do you want to clean all data files in the directory ",wd,"?"))
 	if(numDAT == 0){
 		#the menu was cancelled
 		cat("Nothing done\n")
 		return(invisible(NULL))
 	}
 	GSEfloc <- grep("^GSE.+\\.txt\\.gz$",list.files(),value = TRUE)
 	if(numDAT == 1){
 		#ALL DATA FILES WILL BE CLEANED
 		ANDIS <- GSEfloc
 		if(length(ANDIS) == 0){
 			warning("No GSE series matrix files (GSE*.txt.gz) were found and so no cleaning will be performed")
 		}
 	} else{
 		#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
 		ANDIS <- select.list(choices = GSEfloc,multiple = TRUE, title = "Choose the file/files you want to analyze:")
 		if(length(ANDIS) == 0){
 			#Spit out a warning
 			warning("You did not select any files and so no cleaning will be performed")
 		}
 	}
 	for(alz in ANDIS){
 		.theft_clean_file(alz)
 	}
 	invisible(NULL)
 }
 #The rest of this code is run every time you want to clean a data set:
 #THEFT() is interactive and works on the files of the current working directory
 THEFT()