Efrain Gonzalez / Cleaning and Fixing Data with R

1

########################################################################

1

########################################################################

2

# Don't Use This Code Just Yet #

2

# Don't Use This Code Just Yet #

3

########################################################################

3

########################################################################

4

#Efrain H. Gonzalez

4

#Efrain H. Gonzalez

5

#6/16/2017

5

#6/16/2017

6

7

#Libraries required to run the code

7

#Libraries required to run the code

8

library(pryr)

8

library(pryr)

9

library(MASS)

9

library(MASS)

10

library(dplyr)

10

library(dplyr)

11

library(tidyr)

11

library(tidyr)

12

library(readr)

12

library(readr)

13

library(stringr)

13

library(stringr)

14

15

16

#Necessary Functions

16

#Necessary Functions

17

#1#Function for handling the changing of row names and column names

17

#1#Function for handling the changing of row names and column names

18

chngrownm <- function(mat) {
  # Rename the columns of a transposed GEO series-matrix "!Sample" block.
  #
  # mat: matrix (or data frame) whose first row holds the GEO field tags
  #      (e.g. "!Sample_title") and whose second row holds the first sample's
  #      values (e.g. "age: 85").
  # Returns `mat` with updated column names:
  #   * the three known tags become "Brain_Region", "Title" and "ID_REF";
  #   * any other column is named after the first clinical pattern found in
  #     its second-row value, with a running counter ("Sex1", "Age1", ...);
  #   * columns matching nothing keep their existing name.
  n_row <- dim(mat)[1]
  n_col <- dim(mat)[2]

  # Falls back to "col1", "col2", ... when the input carries no column names.
  new_names <- colnames(mat, do.NULL = FALSE)

  fixed_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Checked in this order; the first hit wins. "Braak" is tested before "Age"
  # because "braak stage: IV" also contains the substring "age".
  field_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Braak = "braak|b&b",
    Age = "age|Age|AGE",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- rep(1L, length(field_patterns))
  names(counters) <- names(field_patterns)

  if (n_row < 1) {
    return(mat)
  }

  # seq_len() keeps the loop empty for a zero-column input.
  for (e in seq_len(n_col)) {
    header <- as.character(mat[1, e])
    if (!is.na(header) && header %in% names(fixed_labels)) {
      new_names[e] <- fixed_labels[[header]]
      next
    }
    if (n_row < 2) {
      next
    }
    value <- as.character(mat[2, e])
    for (field in names(field_patterns)) {
      # grepl() returns FALSE for NA, so missing values simply match nothing.
      if (grepl(field_patterns[[field]], value)) {
        new_names[e] <- paste0(field, counters[[field]])
        counters[[field]] <- counters[[field]] + 1L
        break
      }
    }
  }

  colnames(mat) <- new_names
  mat
}

63

64

#2#Function for reorganizing information within the columns

64

#2#Function for reorganizing information within the columns

65

cinfo <- function(mat) {
  # Clean the clinical columns produced by chngrownm(), based on column name.
  #
  # mat: data frame (or matrix) whose first column is left untouched and whose
  #      remaining columns are cleaned when their name contains one of:
  #        "Group" - strip the "label: " prefix and any trailing "; ..." text
  #        "Age"   - keep digits only, convert to integer
  #        "Sex"   - strip the "label: " prefix
  #        "PMI"   - keep digits and dots only, convert to numeric
  #        "Braak" - strip the "label: " prefix, read as a Roman numeral,
  #                  convert to integer
  # Returns `mat` with those columns rewritten; other columns are unchanged.
  n_col <- dim(mat)[2]

  # With fewer than two columns there is nothing to clean; `2:n_col` would
  # otherwise count down (2, 1) and rewrite the first column.
  if (!isTRUE(n_col >= 2)) {
    return(mat)
  }

  for (j in 2:n_col) {
    col_name <- colnames(mat)[j]
    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    } else if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

92

93

#3#Function for labeling the gene IDs without names

93

#3#Function for labeling the gene IDs without names

94

NAFIXING <- function(GIDNAM) {
  # Give every probe a usable label: wherever the gene name (column 2) is
  # missing, fall back to the probe ID (column 1).
  #
  # GIDNAM: data frame or matrix with the probe ID in column 1 and the gene
  #         name in column 2.
  # Returns `GIDNAM` with column 2 filled in. A name counts as missing when it
  # is NA or is the literal text "NA" optionally followed by whitespace.
  is_df <- is.data.frame(GIDNAM)
  if (is_df) {
    ids <- GIDNAM[[1]]
    labels <- GIDNAM[[2]]
  } else {
    ids <- GIDNAM[, 1]
    labels <- GIDNAM[, 2]
  }

  # Work on text so that a replacement ID is never rejected as an unknown
  # factor level or inserted as a factor's integer code.
  if (is.factor(labels)) {
    labels <- as.character(labels)
  }
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }

  # Vectorised test; also correct for zero rows, where a `1:row` loop would
  # visit indices 1 and 0.
  unnamed <- is.na(labels) | grepl("^NA\\s*$", labels)
  if (any(unnamed)) {
    labels[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- labels
    } else {
      GIDNAM[, 2] <- labels
    }
  }
  GIDNAM
}

105

106

#4#Function for changing the gene ID to gene name

106

#4#Function for changing the gene ID to gene name

107

cgeneID <- function(GeneName, DATA) {
  # Replace probe IDs in the first row of DATA with their gene names.
  #
  # GeneName: two-row matrix; row 1 holds probe IDs, row 2 the gene names.
  # DATA:     matrix whose first row holds the probe IDs to translate.
  # Returns DATA with every first-row entry that equals a known probe ID
  # replaced by the corresponding gene name; other entries are unchanged.
  #
  # IDs are compared as exact strings. Building a regular expression from the
  # ID would let characters such as "." or "(" in an ID change what matches.
  # When an ID is listed more than once in GeneName, the first listing wins.
  gene_ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  gene_names <- as.character(unlist(GeneName[2, ], use.names = FALSE))
  data_ids <- as.character(unlist(DATA[1, ], use.names = FALSE))

  lookup <- match(data_ids, gene_ids)
  # match() pairs NA with NA; a missing ID must never count as a hit.
  found <- !is.na(lookup) & !is.na(data_ids)
  if (any(found)) {
    DATA[1, found] <- gene_names[lookup[found]]
  }
  DATA
}

cgeneID2 <- function(GeneName, DATA) {
  # Variant of cgeneID() that takes both inputs untransposed.
  #
  # GeneName: two-column table; column 1 probe IDs, column 2 gene names.
  # DATA:     table whose first column holds the probe IDs to translate.
  # Returns t(DATA) with its first row translated to gene names.
  cgeneID(t(GeneName), t(DATA))
}

121

139

122

#5#Function for adjusting the gene names

140

#5#Function for adjusting the gene names

123

gcnames <- function(DiData, usecol = 1) {
  # Reduce multi-gene column names ("A /// B /// C") to a single gene name.
  #
  # DiData: matrix or data frame whose column names are gene names, possibly
  #         several joined by "///".
  # usecol: which of the joined names to keep (1 = first). Columns that carry
  #         fewer than `usecol` names fall back to their first name.
  # Returns a character vector with one trimmed name per column of DiData.
  n_col <- dim(DiData)[2]
  if (n_col == 0) {
    return(character(0))
  }

  labels <- colnames(DiData)
  if (is.null(labels)) {
    stop("DiData has no column names to adjust", call. = FALSE)
  }

  vapply(
    labels,
    function(label) {
      parts <- strsplit(label, "///")[[1]]
      keep <- if (length(parts) >= usecol) usecol else 1
      # Base trimws() strips the blanks left around "///".
      trimws(parts[keep])
    },
    character(1),
    USE.NAMES = FALSE
  )
}

138

156

139

#6# Function for discretizing the data

157

#6# Function for discretizing the data

140

dndat <- function(NDATA) {
  # Discretise a numeric matrix into three levels.
  #
  # NDATA: numeric matrix (the caller passes column-scaled expression values).
  # Returns a numeric matrix of the same shape and column names, without row
  # names, where each cell is
  #   0  if the value is below -1,
  #   1  if the value lies in [-1, 1],
  #   2  if the value is above 1,
  # and missing values are carried over as they are.
  #
  # The upper bound of the middle band is inclusive: a value of exactly 1 is
  # level 1, not level 0.
  n_row <- dim(NDATA)[1]
  n_col <- dim(NDATA)[2]
  values <- as.matrix(NDATA)

  # Start every cell in the middle band, then overwrite the two tails.
  DDATA <- matrix(1, nrow = n_row, ncol = n_col)
  colnames(DDATA) <- colnames(NDATA)

  # which() drops NA comparisons, so missing cells are not touched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing_cells <- which(is.na(values))
  DDATA[missing_cells] <- values[missing_cells]
  DDATA
}

169

187

170

188

171

#MajorFunction#This is the function that does everything else

189

#MajorFunction#This is the function that does everything else

172

THEFT <- function(){

190

THEFT <- function(){

173

#Set working directory based on the directory of the series matrix file Currently only works for windows

191

#Set working directory based on the directory of the series matrix file Currently only works for windows

174

wd <- getwd()

192

wd <- getwd()

175

#list.files()

193

#list.files()

176

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

194

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

177

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

195

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

178

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

196

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

179

GSEfloc <- list.files()[GSEfileloc]

197

GSEfloc <- list.files()[GSEfileloc]

180

#ALL DATA FILES WILL BE CLEANED

198

#ALL DATA FILES WILL BE CLEANED

181

if(numDAT == 1){

199

if(numDAT == 1){

182

#indexing the data files

200

#indexing the data files

183

n <- 1

201

n <- 1

184

for(n in 1: length(GSEfloc)){

202

for(n in 1: length(GSEfloc)){

185

alz <- GSEfloc[n]

203

alz <- GSEfloc[n]

186

204

187

#Working with the wordy part of the document

205

#Working with the wordy part of the document

188

alzword <- alz %>%

206

alzword <- alz %>%

189

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

207

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

190

filter(grepl("!Sample",X1))%>%

208

filter(grepl("!Sample",X1))%>%

191

filter(!grepl("!Sample_contact",X1))

209

filter(!grepl("!Sample_contact",X1))

192

210

193

#Getting the GPL file

211

#Getting the GPL file

194

genena <- grep("_platform_id",alzword$X1) %>%

212

genena <- grep("_platform_id",alzword$X1) %>%

195

alzword$X2[.] %>%

213

alzword$X2[.] %>%

196

str_trim(.) %>%

214

str_trim(.) %>%

197

paste0("^",.,"\\D") %>%

215

paste0("^",.,"\\D") %>%

198

grep(.,list.files()) %>%

216

grep(.,list.files()) %>%

199

list.files()[.]

217

list.files()[.]

200

218

201

#Find out if it is a soft GPL file or not

219

#Find out if it is a soft GPL file or not

202

soft <- strsplit(genena,"[\\|/]") %>%

220

soft <- strsplit(genena,"[\\|/]") %>%

203

.[[1]] %>%

221

.[[1]] %>%

204

.[length(.)] %>%

222

.[length(.)] %>%

205

grepl("soft",.)

223

grepl("soft",.)

206

224

207

##Changing row names and column names:

225

##Changing row names and column names:

208

ALZWORD <- t(alzword)

226

ALZWORD <- t(alzword)

209

rownames(ALZWORD)=NULL

227

rownames(ALZWORD)=NULL

210

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

228

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

211

ALZWORD <- chngrownm(ALZWORD)[-1,]

229

ALZWORD <- chngrownm(ALZWORD)[-1,]

212

ALZWORD <- ALZWORD%>%

230

ALZWORD <- ALZWORD%>%

213

as.data.frame()%>%

231

as.data.frame()%>%

214

dplyr::select(-starts_with("col"))

232

dplyr::select(-starts_with("col"))

215

233

216

##Reorganizing information within the columns and final clinical data

234

##Reorganizing information within the columns and final clinical data

217

ALZWORDF <- cinfo(ALZWORD)

235

ALZWORDF <- cinfo(ALZWORD)

218

236

219

237

220

#Working with Actual Data part of file

238

#Working with Actual Data part of file

221

alzdat <- alz %>%

239

alzdat <- alz %>%

222

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

240

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

223

ALZDAT <- t(alzdat[,-1])

241

ALZDAT <- t(alzdat[,-1])

224

rownames(ALZDAT)=NULL

242

rownames(ALZDAT)=NULL

225

243

226

##Is there a clean version of the GPL file available?

244

##Is there a clean version of the GPL file available?

227

gplnum <- strsplit(genena,"[\\|/]") %>%

245

gplnum <- strsplit(genena,"[\\|/]") %>%

228

.[[1]] %>%

246

.[[1]] %>%

229

.[length(.)] %>%

247

.[length(.)] %>%

230

gsub("\\D","",.)

248

gsub("\\D","",.)

231

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

249

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

232

if(clfileex >= 1){

250

if(clfileex >= 1){

233

#use the clean version

251

#use the clean version

234

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

252

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

235

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

253

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

236

254

237

}

255

}

238

else if(clfileex == 0){

256

else if(clfileex == 0){

239

##Lets Create a clean version

257

##Lets Create a clean version

240

258

241

##Gene ID to Gene Name

259

##Gene ID to Gene Name

242

if(soft == TRUE){

260

if(soft == TRUE){

243

#Check to see if there is already a file containing information on soft files

261

#Check to see if there is already a file containing information on soft files

244

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

262

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

245

if(fileex == 1){

263

if(fileex == 1){

246

#Check to see if this GPL soft file has been used before

264

#Check to see if this GPL soft file has been used before

247

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

265

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

248

.$GPL_FILE_NUM%>%

266

.$GPL_FILE_NUM%>%

249

grepl(gplnum,.) %>%

267

grepl(gplnum,.) %>%

250

sum()

268

sum()

251

if(IDF == 1){

269

if(IDF == 1){

252

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

270

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

253

.$GPL_FILE_NUM%>%

271

.$GPL_FILE_NUM%>%

254

grep(gplnum,.)

272

grep(gplnum,.)

255

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

273

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

256

.$LOC_ID %>%

274

.$LOC_ID %>%

257

.[IDLOCAL]

275

.[IDLOCAL]

258

geneIDNam <- genena %>%

276

geneIDNam <- genena %>%

259

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

277

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

260

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

278

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

261

}

279

}

262

else if(IDF == 0){

280

else if(IDF == 0){

263

#No information on this particular GPL file

281

#No information on this particular GPL file

264

idLOCGPL <- genena %>%

282

idLOCGPL <- genena %>%

265

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

283

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

266

t(.) %>%

284

t(.) %>%

267

grep("^ID\\s*$",.) %>%

285

grep("^ID\\s*$",.) %>%

268

-1

286

-1

269

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

287

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

270

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

288

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

271

geneIDNam <- genena %>%

289

geneIDNam <- genena %>%

272

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

290

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

273

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

291

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

274

}

292

}

275

}

293

}

276

else if(fileex == 0){

294

else if(fileex == 0){

277

#We must create a file that we can access for later use

295

#We must create a file that we can access for later use

278

idLOCGPL <- genena %>%

296

idLOCGPL <- genena %>%

279

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

297

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

280

t(.) %>%

298

t(.) %>%

281

grep("^ID\\s*$",.) %>%

299

grep("^ID\\s*$",.) %>%

282

-1

300

-1

283

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

301

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

284

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

302

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

285

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

303

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

286

geneIDNam <- genena %>%

304

geneIDNam <- genena %>%

287

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

305

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

288

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

306

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

289

}

307

}

290

}

308

}

291

else if(soft == FALSE){

309

else if(soft == FALSE){

292

geneIDNam <- genena %>%

310

geneIDNam <- genena %>%

293

read_delim(delim="\t",comment = "#")%>%

311

read_delim(delim="\t",comment = "#")%>%

294

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

312

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

295

}

313

}

296

314

297

##Labeling the gene IDs without names

315

##Labeling the gene IDs without names

298

geneIDNam <- NAFIXING(geneIDNam)

316

geneIDNam <- NAFIXING(geneIDNam)

299

317

300

##remove the whitespace

318

##remove the whitespace

301

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

319

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

302

320

303

##Here is the clean version

321

##Here is the clean version

304

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

322

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

305

}

323

}

306

324

307

325

308

326

309

##Changing the gene ID to gene name

327

##Changing the gene ID to gene name

310

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

328

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

311

colnames(ALZDAT) = ALZDAT1[1,]

329

colnames(ALZDAT) = ALZDAT1[1,]

312

330

313

331

314

##Adjusting the column names aka the gene names

332

##Adjusting the column names aka the gene names

315

colnames(ALZDAT) <- gcnames(ALZDAT)

333

colnames(ALZDAT) <- gcnames(ALZDAT)

316

334

317

335

318

#Full RAW Data

336

#Full RAW Data

319

Fullalzdwr <- ALZDAT %>%

337

Fullalzdwr <- ALZDAT %>%

320

as.data.frame() %>%

338

as.data.frame() %>%

321

cbind(ALZWORDF,.)

339

cbind(ALZWORDF,.)

322

340

323

#Raw file is output

341

#Raw file is output

324

nfnaex <- strsplit(alz,"[\\]") %>%

342

nfnaex <- strsplit(alz,"[\\]") %>%

325

.[[1]] %>%

343

.[[1]] %>%

326

.[length(.)] %>%

344

.[length(.)] %>%

327

gsub("\\D","",.) %>%

345

gsub("\\D","",.) %>%

328

c("GSE",.,"aftexcel.txt") %>%

346

c("GSE",.,"aftexcel.txt") %>%

329

paste(collapse = "")

347

paste(collapse = "")

330

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

348

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

331

349

332

350

333

351

334

#Now for the discretization part

352

#Now for the discretization part

335

##get the wordy part again

353

##get the wordy part again

336

rawword <- t(ALZWORDF)

354

rawword <- t(ALZWORDF)

337

355

338

##where is ID_REF located

356

##where is ID_REF located

339

hereim <- grep("ID_REF",rownames(rawword))

357

hereim <- grep("ID_REF",rownames(rawword))

340

358

341

##Subject Names GSM...

359

##Subject Names GSM...

342

subjnam <- rawword[hereim,]

360

subjnam <- rawword[hereim,]

343

361

344

##Getting the names for the rows

362

##Getting the names for the rows

345

namedarows <- rownames(rawword)[-hereim] %>%

363

namedarows <- rownames(rawword)[-hereim] %>%

346

as.data.frame()

364

as.data.frame()

347

RAWWORD <- rawword[-hereim,] %>%

365

RAWWORD <- rawword[-hereim,] %>%

348

as.data.frame() %>%

366

as.data.frame() %>%

349

bind_cols(namedarows,.)

367

bind_cols(namedarows,.)

350

z <- 1

368

z <- 1

351

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

369

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

352

for(z in 1:dim(RAWWORD)[1]){

370

for(z in 1:dim(RAWWORD)[1]){

353

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

371

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

354

z <- z + 1

372

z <- z + 1

355

}

373

}

356

374

357

colnames(naroww) <- "ROW_NAs"

375

colnames(naroww) <- "ROW_NAs"

358

RAWWORD <- bind_cols(RAWWORD,naroww)

376

RAWWORD <- bind_cols(RAWWORD,naroww)

359

377

360

378

361

roALZna <- t(ALZDAT) %>%

379

roALZna <- t(ALZDAT) %>%

362

rownames(.) %>%

380

rownames(.) %>%

363

as.data.frame(.)

381

as.data.frame(.)

364

colnames(roALZna) <- "ID_REF"

382

colnames(roALZna) <- "ID_REF"

365

383

366

RAWDAT <- t(ALZDAT) %>%

384

RAWDAT <- t(ALZDAT) %>%

367

as.data.frame(.)

385

as.data.frame(.)

368

colnames(RAWDAT) <- NULL

386

colnames(RAWDAT) <- NULL

369

rownames(RAWDAT) <- NULL

387

rownames(RAWDAT) <- NULL

370

388

371

RAWDAT2 <- RAWDAT %>%

389

RAWDAT2 <- RAWDAT %>%

372

cbind(roALZna,.) %>%

390

cbind(roALZna,.) %>%

373

dplyr::arrange(.,ID_REF)

391

dplyr::arrange(.,ID_REF)

374

392

375

##Editing the file for R processing

393

##Editing the file for R processing

376

RAWDATID <- RAWDAT2[,1] %>%

394

RAWDATID <- RAWDAT2[,1] %>%

377

as.matrix(.)

395

as.matrix(.)

378

396

379

RAWDATNUM <- RAWDAT2[,-1] %>%

397

RAWDATNUM <- RAWDAT2[,-1] %>%

380

mapply(.,FUN = as.numeric) %>%

398

mapply(.,FUN = as.numeric) %>%

381

t(.)

399

t(.)

382

400

383

##Consolidating genes with the same name

401

##Consolidating genes with the same name

384

###create empty matrix of size equal to tabRDATID

402

###create empty matrix of size equal to tabRDATID

385

tabRDATID <- table(RAWDATID)

403

tabRDATID <- table(RAWDATID)

386

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

404

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

387

j <- 1

405

j <- 1

388

for(j in 1:length(tabRDATID)){

406

for(j in 1:length(tabRDATID)){

389

##Putting the ones without duplicates in their new homes

407

##Putting the ones without duplicates in their new homes

390

if(tabRDATID[j] == 1){

408

if(tabRDATID[j] == 1){

391

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

409

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

392

}

410

}

393

##Averaging duplicates and putting them in their new homes

411

##Averaging duplicates and putting them in their new homes

394

else if(tabRDATID[j] > 1){

412

else if(tabRDATID[j] > 1){

395

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

413

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

396

}

414

}

397

j <- j + 1

415

j <- j + 1

398

}

416

}

399

417

400

##Scaling the Data

418

##Scaling the Data

401

scrawdat <- NuRDATN%>%

419

scrawdat <- NuRDATN%>%

402

scale()

420

scale()

403

attr(scrawdat,"scaled:center") <- NULL

421

attr(scrawdat,"scaled:center") <- NULL

404

attr(scrawdat,"scaled:scale") <- NULL

422

attr(scrawdat,"scaled:scale") <- NULL

405

colnames(scrawdat) <- rownames(tabRDATID)

423

colnames(scrawdat) <- rownames(tabRDATID)

406

424

407

##Discretized the Data

425

##Discretized the Data

408

dialzdat <- scrawdat %>%

426

dialzdat <- scrawdat %>%

409

dndat(.) %>%

427

dndat(.) %>%

410

t()%>%

428

t()%>%

411

as.data.frame(.)

429

as.data.frame(.)

412

colnames(dialzdat) <- rownames(RAWDATNUM)

430

colnames(dialzdat) <- rownames(RAWDATNUM)

413

431

414

##setting "ID_REF" as a new variable

432

##setting "ID_REF" as a new variable

415

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

433

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

416

colnames(geneNAM) <- "ID_REF"

434

colnames(geneNAM) <- "ID_REF"

417

rownames(dialzdat) <- NULL

435

rownames(dialzdat) <- NULL

418

dialzdat <-bind_cols(geneNAM,dialzdat)

436

dialzdat <-bind_cols(geneNAM,dialzdat)

419

437

420

##NAs in a column

438

##NAs in a column

421

x <- 2

439

x <- 2

422

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

440

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

423

nacol[1,1] = "COL_NAs"

441

nacol[1,1] = "COL_NAs"

424

for(x in 2:dim(dialzdat)[2]){

442

for(x in 2:dim(dialzdat)[2]){

425

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

443

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

426

x <- x + 1

444

x <- x + 1

427

}

445

}

428

colnames(nacol) <- colnames(dialzdat)

446

colnames(nacol) <- colnames(dialzdat)

429

dialzdat <- bind_rows(dialzdat,nacol)

447

dialzdat <- bind_rows(dialzdat,nacol)

430

448

431

##NAs in a row

449

##NAs in a row

432

y <- 1

450

y <- 1

433

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

451

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

434

for(y in 1:dim(dialzdat)[1]){

452

for(y in 1:dim(dialzdat)[1]){

435

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

453

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

436

y <- y + 1

454

y <- y + 1

437

}

455

}

438

colnames(narowd) <- "ROW_NAs"

456

colnames(narowd) <- "ROW_NAs"

439

dialzdat <- bind_cols(dialzdat,narowd)

457

dialzdat <- bind_cols(dialzdat,narowd)

440

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

458

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

441

colnames(RAWWORD) <- colnames(dialzdat)

459

colnames(RAWWORD) <- colnames(dialzdat)

442

##converting to character so that the clinical can be brought together with discrete data

460

##converting to character so that the clinical can be brought together with discrete data

443

k <- 2

461

k <- 2

444

for(k in 2:dim(dialzdat)[2]-1){

462

for(k in 2:dim(dialzdat)[2]-1){

445

dialzdat[,k] <- as.character(dialzdat[,k])

463

dialzdat[,k] <- as.character(dialzdat[,k])

446

k <- k + 1

464

k <- k + 1

447

}

465

}

448

#The End the full data

466

#The End the full data

449

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

467

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

450

468

451

#Produces Discrete file

469

#Produces Discrete file

452

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

470

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

453

.[[1]] %>%

471

.[[1]] %>%

454

.[length(.)] %>%

472

.[length(.)] %>%

455

gsub("\\D","",.) %>%

473

gsub("\\D","",.) %>%

456

c("GSE",.,"dscrt.txt") %>%

474

c("GSE",.,"dscrt.txt") %>%

457

paste(collapse = "")

475

paste(collapse = "")

458

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

476

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

459

n <- n +1

477

n <- n +1

460

}

478

}

461

}

479

}

462

480

463

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

481

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

464

else if(numDAT == 2){

482

else if(numDAT == 2){

465

#All the files you want to analyze

483

#All the files you want to analyze

466

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

484

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

467

if(length(ANDIS) == 0){

485

if(length(ANDIS) == 0){

468

#Spit out a warning

486

#Spit out a warning

469

warning("You did not select any files and so no cleaning will be performed")

487

warning("You did not select any files and so no cleaning will be performed")

470

} else{

488

} else{

471

#indexing the data files

489

#indexing the data files

472

n <- 1

490

n <- 1

473

for(n in 1: length(ANDIS)){

491

for(n in 1: length(ANDIS)){

474

alz <- ANDIS[n]

492

alz <- ANDIS[n]

475

493

476

#Working with the wordy part of the document

494

#Working with the wordy part of the document

477

alzword <- alz %>%

495

alzword <- alz %>%

478

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

496

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

479

filter(grepl("!Sample",X1))%>%

497

filter(grepl("!Sample",X1))%>%

480

filter(!grepl("!Sample_contact",X1))

498

filter(!grepl("!Sample_contact",X1))

481

499

482

#Getting the GPL file

500

#Getting the GPL file

483

genena <- grep("_platform_id",alzword$X1) %>%

501

genena <- grep("_platform_id",alzword$X1) %>%

484

alzword$X2[.] %>%

502

alzword$X2[.] %>%

485

str_trim(.) %>%

503

str_trim(.) %>%

486

paste0("^",.,"\\D") %>%

504

paste0("^",.,"\\D") %>%

487

grep(.,list.files()) %>%

505

grep(.,list.files()) %>%

488

list.files()[.]

506

list.files()[.]

489

507

490

#Find out if it is a soft GPL file or not

508

#Find out if it is a soft GPL file or not

491

soft <- strsplit(genena,"[\\|/]") %>%

509

soft <- strsplit(genena,"[\\|/]") %>%

492

.[[1]] %>%

510

.[[1]] %>%

493

.[length(.)] %>%

511

.[length(.)] %>%

494

grepl("soft",.)

512

grepl("soft",.)

495

513

496

##Changing row names and column names:

514

##Changing row names and column names:

497

ALZWORD <- t(alzword)

515

ALZWORD <- t(alzword)

498

rownames(ALZWORD)=NULL

516

rownames(ALZWORD)=NULL

499

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

517

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

500

ALZWORD <- chngrownm(ALZWORD)[-1,]

518

ALZWORD <- chngrownm(ALZWORD)[-1,]

501

ALZWORD <- ALZWORD%>%

519

ALZWORD <- ALZWORD%>%

502

as.data.frame()%>%

520

as.data.frame()%>%

503

dplyr::select(-starts_with("col"))

521

dplyr::select(-starts_with("col"))

504

522

505

##Reorganizing information within the columns and final clinical data

523

##Reorganizing information within the columns and final clinical data

506

ALZWORDF <- cinfo(ALZWORD)

524

ALZWORDF <- cinfo(ALZWORD)

507

525

508

526

509

#Working with Actual Data part of file

527

#Working with Actual Data part of file

510

alzdat <- alz %>%

528

alzdat <- alz %>%

511

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

529

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

512

ALZDAT <- t(alzdat[,-1])

530

ALZDAT <- t(alzdat[,-1])

513

rownames(ALZDAT)=NULL

531

rownames(ALZDAT)=NULL

514

532

515

##Is there a clean version of the GPL file available?

533

##Is there a clean version of the GPL file available?

516

gplnum <- strsplit(genena,"[\\|/]") %>%

534

gplnum <- strsplit(genena,"[\\|/]") %>%

517

.[[1]] %>%

535

.[[1]] %>%

518

.[length(.)] %>%

536

.[length(.)] %>%

519

gsub("\\D","",.)

537

gsub("\\D","",.)

520

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

538

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

521

if(clfileex >= 1){

539

if(clfileex >= 1){

522

#use the clean version

540

#use the clean version

523

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

541

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

524

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

542

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

525

543

526

}

544

}

527

else if(clfileex == 0){

545

else if(clfileex == 0){

528

##Lets Create a clean version

546

##Lets Create a clean version

529

547

530

##Gene ID to Gene Name

548

##Gene ID to Gene Name

531

if(soft == TRUE){

549

if(soft == TRUE){

532

#Check to see if there is already a file containing information on soft files

550

#Check to see if there is already a file containing information on soft files

533

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

551

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

534

if(fileex == 1){

552

if(fileex == 1){

535

#Check to see if this GPL soft file has been used before

553

#Check to see if this GPL soft file has been used before

536

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

554

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

537

.$GPL_FILE_NUM%>%

555

.$GPL_FILE_NUM%>%

538

grepl(gplnum,.) %>%

556

grepl(gplnum,.) %>%

539

sum()

557

sum()

540

if(IDF == 1){

558

if(IDF == 1){

541

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

559

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

542

.$GPL_FILE_NUM%>%

560

.$GPL_FILE_NUM%>%

543

grep(gplnum,.)

561

grep(gplnum,.)

544

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

562

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

545

.$LOC_ID %>%

563

.$LOC_ID %>%

546

.[IDLOCAL]

564

.[IDLOCAL]

547

geneIDNam <- genena %>%

565

geneIDNam <- genena %>%

548

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

566

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

549

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

567

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

550

}

568

}

551

else if(IDF == 0){

569

else if(IDF == 0){

552

#No information on this particular GPL file

570

#No information on this particular GPL file

553

idLOCGPL <- genena %>%

571

idLOCGPL <- genena %>%

554

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

572

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

555

t(.) %>%

573

t(.) %>%

556

grep("^ID\\s*$",.) %>%

574

grep("^ID\\s*$",.) %>%

557

-1

575

-1

558

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

576

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

559

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

577

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

560

geneIDNam <- genena %>%

578

geneIDNam <- genena %>%

561

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

579

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

562

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

580

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

563

}

581

}

564

}

582

}

565

else if(fileex == 0){

583

else if(fileex == 0){

566

#We must create a file that we can access for later use

584

#We must create a file that we can access for later use

567

idLOCGPL <- genena %>%

585

idLOCGPL <- genena %>%

568

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

586

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

569

t(.) %>%

587

t(.) %>%

570

grep("^ID\\s*$",.) %>%

588

grep("^ID\\s*$",.) %>%

571

-1

589

-1

572

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

590

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

573

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

591

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

574

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

592

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

575

geneIDNam <- genena %>%

593

geneIDNam <- genena %>%

576

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

594

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

577

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

595

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

578

}

596

}

579

}

597

}

580

else if(soft == FALSE){

598

else if(soft == FALSE){

581

geneIDNam <- genena %>%

599

geneIDNam <- genena %>%

582

read_delim(delim="\t",comment = "#")%>%

600

read_delim(delim="\t",comment = "#")%>%

583

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

601

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

584

}

602

}

585

603

586

##Labeling the gene IDs without names

604

##Labeling the gene IDs without names

587

geneIDNam <- NAFIXING(geneIDNam)

605

geneIDNam <- NAFIXING(geneIDNam)

588

606

589

##remove the whitespace

607

##remove the whitespace

590

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

608

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

591

609

592

##Here is the clean version

610

##Here is the clean version

593

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

611

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

594

}

612

}

595

613

596

614

597

615

598

##Changing the gene ID to gene name

616

##Changing the gene ID to gene name

599

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

617

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

600

colnames(ALZDAT) = ALZDAT1[1,]

618

colnames(ALZDAT) = ALZDAT1[1,]

601

619

602

620

603

##Adjusting the column names aka the gene names

621

##Adjusting the column names aka the gene names

604

colnames(ALZDAT) <- gcnames(ALZDAT)

622

colnames(ALZDAT) <- gcnames(ALZDAT)

605

623

606

624

607

#Full RAW Data

625

#Full RAW Data

608

Fullalzdwr <- ALZDAT %>%

626

Fullalzdwr <- ALZDAT %>%

609

as.data.frame() %>%

627

as.data.frame() %>%

610

cbind(ALZWORDF,.)

628

cbind(ALZWORDF,.)

611

629

612

#Raw file is output

630

#Raw file is output

613

nfnaex <- strsplit(alz,"[\\]") %>%

631

nfnaex <- strsplit(alz,"[\\]") %>%

614

.[[1]] %>%

632

.[[1]] %>%

615

.[length(.)] %>%

633

.[length(.)] %>%

616

gsub("\\D","",.) %>%

634

gsub("\\D","",.) %>%

617

c("GSE",.,"aftexcel.txt") %>%

635

c("GSE",.,"aftexcel.txt") %>%

618

paste(collapse = "")

636

paste(collapse = "")

619

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

637

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

620

638

621

639

622

640

623

#Now for the discretization part

641

#Now for the discretization part

624

##get the wordy part again

642

##get the wordy part again

625

rawword <- t(ALZWORDF)

643

rawword <- t(ALZWORDF)

626

644

627

##where is ID_REF located

645

##where is ID_REF located

628

hereim <- grep("ID_REF",rownames(rawword))

646

hereim <- grep("ID_REF",rownames(rawword))

629

647

630

##Subject Names GSM...

648

##Subject Names GSM...

631

subjnam <- rawword[hereim,]

649

subjnam <- rawword[hereim,]

632

650

633

##Getting the names for the rows

651

##Getting the names for the rows

634

namedarows <- rownames(rawword)[-hereim] %>%

652

namedarows <- rownames(rawword)[-hereim] %>%

635

as.data.frame()

653

as.data.frame()

636

RAWWORD <- rawword[-hereim,] %>%

654

RAWWORD <- rawword[-hereim,] %>%

637

as.data.frame() %>%

655

as.data.frame() %>%

638

bind_cols(namedarows,.)

656

bind_cols(namedarows,.)

639

z <- 1

657

z <- 1

640

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

658

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

641

for(z in 1:dim(RAWWORD)[1]){

659

for(z in 1:dim(RAWWORD)[1]){

642

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

660

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

643

z <- z + 1

661

z <- z + 1

644

}

662

}

645

663

646

colnames(naroww) <- "ROW_NAs"

664

colnames(naroww) <- "ROW_NAs"

647

RAWWORD <- bind_cols(RAWWORD,naroww)

665

RAWWORD <- bind_cols(RAWWORD,naroww)

648

666

649

667

650

roALZna <- t(ALZDAT) %>%

668

roALZna <- t(ALZDAT) %>%

651

rownames(.) %>%

669

rownames(.) %>%

652

as.data.frame(.)

670

as.data.frame(.)

653

colnames(roALZna) <- "ID_REF"

671

colnames(roALZna) <- "ID_REF"

654

672

655

RAWDAT <- t(ALZDAT) %>%

673

RAWDAT <- t(ALZDAT) %>%

656

as.data.frame(.)

674

as.data.frame(.)

657

colnames(RAWDAT) <- NULL

675

colnames(RAWDAT) <- NULL

658

rownames(RAWDAT) <- NULL

676

rownames(RAWDAT) <- NULL

659

677

660

RAWDAT2 <- RAWDAT %>%

678

RAWDAT2 <- RAWDAT %>%

661

cbind(roALZna,.) %>%

679

cbind(roALZna,.) %>%

662

dplyr::arrange(.,ID_REF)

680

dplyr::arrange(.,ID_REF)

663

681

664

##Editing the file for R processing

682

##Editing the file for R processing

665

RAWDATID <- RAWDAT2[,1] %>%

683

RAWDATID <- RAWDAT2[,1] %>%

666

as.matrix(.)

684

as.matrix(.)

667

685

668

RAWDATNUM <- RAWDAT2[,-1] %>%

686

RAWDATNUM <- RAWDAT2[,-1] %>%

669

mapply(.,FUN = as.numeric) %>%

687

mapply(.,FUN = as.numeric) %>%

670

t(.)

688

t(.)

671

689

672

##Consolidating genes with the same name

690

##Consolidating genes with the same name

673

###create empty matrix of size equal to tabRDATID

691

###create empty matrix of size equal to tabRDATID

674

tabRDATID <- table(RAWDATID)

692

tabRDATID <- table(RAWDATID)

675

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

693

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

676

j <- 1

694

j <- 1

677

for(j in 1:length(tabRDATID)){

695

for(j in 1:length(tabRDATID)){

678

##Putting the ones without duplicates in their new homes

696

##Putting the ones without duplicates in their new homes

679

if(tabRDATID[j] == 1){

697

if(tabRDATID[j] == 1){

680

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

698

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

681

}

699

}

682

##Averaging duplicates and putting them in their new homes

700

##Averaging duplicates and putting them in their new homes

683

else if(tabRDATID[j] > 1){

701

else if(tabRDATID[j] > 1){

684

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

702

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

685

}

703

}

686

j <- j + 1

704

j <- j + 1

687

}

705

}

688

706

689

##Scaling the Data

707

##Scaling the Data

690

scrawdat <- NuRDATN%>%

708

scrawdat <- NuRDATN%>%

691

scale()

709

scale()

692

attr(scrawdat,"scaled:center") <- NULL

710

attr(scrawdat,"scaled:center") <- NULL

693

attr(scrawdat,"scaled:scale") <- NULL

711

attr(scrawdat,"scaled:scale") <- NULL

694

colnames(scrawdat) <- rownames(tabRDATID)

712

colnames(scrawdat) <- rownames(tabRDATID)

695

713

696

##Discretized the Data

714

##Discretized the Data

697

dialzdat <- scrawdat %>%

715

dialzdat <- scrawdat %>%

698

dndat(.) %>%

716

dndat(.) %>%

699

t()%>%

717

t()%>%

700

as.data.frame(.)

718

as.data.frame(.)

701

colnames(dialzdat) <- rownames(RAWDATNUM)

719

colnames(dialzdat) <- rownames(RAWDATNUM)

702

720

703

##setting "ID_REF" as a new variable

721

##setting "ID_REF" as a new variable

704

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

722

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

705

colnames(geneNAM) <- "ID_REF"

723

colnames(geneNAM) <- "ID_REF"

706

rownames(dialzdat) <- NULL

724

rownames(dialzdat) <- NULL

707

dialzdat <-bind_cols(geneNAM,dialzdat)

725

dialzdat <-bind_cols(geneNAM,dialzdat)

708

726

709

##NAs in a column

727

##NAs in a column

710

x <- 2

728

x <- 2

711

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

729

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

712

nacol[1,1] = "COL_NAs"

730

nacol[1,1] = "COL_NAs"

713

for(x in 2:dim(dialzdat)[2]){

731

for(x in 2:dim(dialzdat)[2]){

714

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

732

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

715

x <- x + 1

733

x <- x + 1

716

}

734

}

717

colnames(nacol) <- colnames(dialzdat)

735

colnames(nacol) <- colnames(dialzdat)

718

dialzdat <- bind_rows(dialzdat,nacol)

736

dialzdat <- bind_rows(dialzdat,nacol)

719

737

720

##NAs in a row

738

##NAs in a row

721

y <- 1

739

y <- 1

722

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

740

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

723

for(y in 1:dim(dialzdat)[1]){

741

for(y in 1:dim(dialzdat)[1]){

724

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

742

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

725

y <- y + 1

743

y <- y + 1

726

}

744

}

727

colnames(narowd) <- "ROW_NAs"

745

colnames(narowd) <- "ROW_NAs"

728

dialzdat <- bind_cols(dialzdat,narowd)

746

dialzdat <- bind_cols(dialzdat,narowd)

729

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

747

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

730

colnames(RAWWORD) <- colnames(dialzdat)

748

colnames(RAWWORD) <- colnames(dialzdat)

731

##converting to character so that the clinical can be brought together with discrete data

749

##converting to character so that the clinical can be brought together with discrete data

732

k <- 2

750

k <- 2

733

for(k in 2:dim(dialzdat)[2]-1){

751

for(k in 2:dim(dialzdat)[2]-1){

734

dialzdat[,k] <- as.character(dialzdat[,k])

752

dialzdat[,k] <- as.character(dialzdat[,k])

735

k <- k + 1

753

k <- k + 1

736

}

754

}

737

#The End the full data

755

#The End the full data

738

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

756

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

739

757

740

#Produces Discrete file

758

#Produces Discrete file

741

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

759

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

742

.[[1]] %>%

760

.[[1]] %>%

743

.[length(.)] %>%

761

.[length(.)] %>%

744

gsub("\\D","",.) %>%

762

gsub("\\D","",.) %>%

745

c("GSE",.,"dscrt.txt") %>%

763

c("GSE",.,"dscrt.txt") %>%

746

paste(collapse = "")

764

paste(collapse = "")

747

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

765

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

748

766

749

767

750

n <- n + 1

768

n <- n + 1

751

}

769

}

752

}

770

}

753

}

771

}

754

}

772

}

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Don't use this code yet

 ########################################################################
 #               Don't Use This Code Just Yet                           #
 ########################################################################
 #Efrain H. Gonzalez
 #6/16/2017
 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 # Rename the columns of a transposed GEO "!Sample" block.
 #
 # mat: matrix (or data frame) that already carries column names. Row 1 holds
 #      the GEO field tags (e.g. "!Sample_title"); row 2 holds the first
 #      sample's values, which are pattern-matched to classify other columns.
 # Returns mat with only its column names changed. Columns that match no tag
 # and no pattern keep whatever name they had on entry.
 chngrownm <- function(mat){
 	# Tags that map straight to a fixed column name.
 	fixed <- c(
 		"!Sample_source_name_ch1" = "Brain_Region",
 		"!Sample_title" = "Title",
 		"!Sample_geo_accession" = "ID_REF"
 	)
 	# Value patterns, tried in this order; the first hit wins.
 	# NOTE(review): "age" is unanchored and is tried before "braak", so a value
 	# such as "braak stage: ..." is labelled Age - confirm this is intended.
 	patterns <- c(
 		Sex = "Sex|gender|Gender|sex",
 		PMI = "postmorteminterval|PMI|pmi",
 		Age = "age|Age|AGE",
 		Braak = "braak|b&b",
 		Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
 	)
 	# Per-kind counters so repeated fields are numbered (Sex1, Sex2, ...).
 	counts <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)
 	# seq_len() yields an empty sequence for a zero-column input, where
 	# 1:col would have iterated over c(1, 0) and failed.
 	for (e in seq_len(ncol(mat))) {
 		# match() returns NA for a missing tag instead of aborting an if().
 		known <- match(mat[1, e], names(fixed))
 		if (!is.na(known)) {
 			colnames(mat)[e] <- fixed[[known]]
 			next
 		}
 		hits <- which(vapply(patterns, grepl, logical(1), x = mat[2, e]))
 		if (length(hits) > 0) {
 			kind <- names(patterns)[hits[1]]
 			colnames(mat)[e] <- paste0(kind, counts[[kind]])
 			counts[[kind]] <- counts[[kind]] + 1
 		}
 	}
 	mat
 }
 #2#Function for reorganizing information within the columns
 # Clean the clinical columns in place, dispatching on the column name.
 #
 # mat: table whose column names contain "Group", "Age", "Sex", "PMI" or
 #      "Braak" (as assigned by chngrownm). The first column is never touched.
 # Returns mat with, per matching column (first matching rule wins):
 #   Group - text up to ": " removed, plus any " ...;..." tail
 #   Age   - every non-digit removed, converted to integer
 #   Sex   - text up to ": " removed
 #   PMI   - everything except digits and "." removed, converted to numeric
 #   Braak - text up to ": " removed, read as a roman numeral, to integer
 cinfo <- function(mat){
 	# seq_len(n)[-1] is 2..n and is empty when there are fewer than two
 	# columns; 2:col counted downwards there and rewrote column 1 or failed.
 	for (j in seq_len(ncol(mat))[-1]) {
 		field <- colnames(mat)[j]
 		if (grepl("Group", field)) {
 			mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
 		} else if (grepl("Age", field)) {
 			mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
 		} else if (grepl("Sex", field)) {
 			mat[, j] <- gsub(".+:\\s", "", mat[, j])
 		} else if (grepl("PMI", field)) {
 			mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
 		} else if (grepl("Braak", field)) {
 			mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
 		}
 	}
 	mat
 }
 #3#Function for labeling the gene IDs without names
 # Give every probe without a gene name its own ID as the name.
 #
 # GIDNAM: two-dimensional table; column 1 holds the IDs, column 2 the names.
 # Returns GIDNAM where each column-2 entry that is NA, or is the text "NA"
 # optionally followed by whitespace, has been replaced by the column-1 value
 # from the same row. Other columns and rows are left as they were.
 NAFIXING <- function(GIDNAM){
 	# Pull both columns out as plain vectors; [[ ]] is used for data frames
 	# so that tibbles (which never drop to a vector with [ , j]) work too.
 	if (is.data.frame(GIDNAM)) {
 		ids <- GIDNAM[[1]]
 		labels <- GIDNAM[[2]]
 	} else {
 		ids <- GIDNAM[, 1]
 		labels <- GIDNAM[, 2]
 	}
 	unnamed <- is.na(labels) | grepl("^NA\\s*$", labels)
 	# Guarding on any() also makes a zero-row table a no-op; the former
 	# 1:row loop visited rows 1 and 0 in that case.
 	if (any(unnamed)) {
 		GIDNAM[unnamed, 2] <- ids[unnamed]
 	}
 	GIDNAM
 }
 #4#Function for changing the gene ID to gene name
-cgeneID <- function(GeneName,DATA){
+cgeneID2 <- function(GeneName,DATA){
-    colGene <- dim(GeneName)[2]
+	nj <- t(GeneName)
-     j <- 1
+	nq <- t(DATA)
-     for(j in 1:colGene){
+	colGene <- dim(nj)[2]
-	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
+	colDATA <- dim(nq)[2]
-	if(is.na(sum(chngsreq))==FALSE){
+	j <- 1
-		if(sum(chngsreq) > 0){
+	for(j in 1:colDATA){
-			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+		#where is that gene id located within the GPL file
+		chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])
+		if(is.na(sum(chngreq))==FALSE){
+			if(sum(chngreq) > 0){
+			nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])
+			}
 		}
+		j <- j + 1
 	}
-	j = j+1
+	nq
-	}
-	DATA
 }
+#cgeneID <- function(GeneName,DATA){
+#    colGene <- dim(GeneName)[2]
+#     j <- 1
+#     for(j in 1:colGene){
+#	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
+#	if(is.na(sum(chngsreq))==FALSE){
+#		if(sum(chngsreq) > 0){
+#			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
+#		}
+#	}
+#	j = j+1
+#	}
+#	DATA
+#}
 #5#Function for adjusting the gene names
 # Pick one gene name per column from "///"-separated column names.
 #
 # DiData: table whose column names may list several names joined by "///".
 # usecol: which of the listed names to keep (default 1, the first).
 # Returns a character vector with one entry per column: the usecol-th piece
 # of that column's name with surrounding whitespace trimmed, or the first
 # piece when the name has fewer than usecol pieces.
 gcnames <- function(DiData,usecol=1){
 	# vapply over seq_len() returns character(0) for a zero-column input,
 	# where the former 1:nuruns loop ran for i = 1 and failed.
 	vapply(seq_len(ncol(DiData)), function(i) {
 		pieces <- strsplit(colnames(DiData)[i], "///")[[1]]
 		pick <- if (length(pieces) >= usecol) usecol else 1
 		str_trim(pieces[pick])
 	}, character(1))
 }
 #6# Function for discretizing the data
 # Discretize a numeric matrix into three levels.
 #
 # NDATA: numeric matrix (the visible caller passes the result of scale()).
 # Returns a double matrix with NDATA's dimensions and column names holding
 #   0 where the value is below -1,
 #   2 where the value is above 1,
 #   1 everywhere else, i.e. on the closed interval [-1, 1],
 # with missing entries copied through unchanged.
 # NOTE(review): a value exactly equal to 1 used to match none of the three
 # conditions and was left at the initial 0 (the low bin). It is placed in the
 # middle bin here - confirm that is the intended side of the boundary.
 dndat <- function(NDATA){
 	# Start every cell in the middle bin, then overwrite the two tails.
 	DDATA <- matrix(1, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
 	colnames(DDATA) <- colnames(NDATA)
 	# which() drops the NA comparisons, so missing cells are skipped here.
 	DDATA[which(NDATA < (-1))] <- 0
 	DDATA[which(NDATA > 1)] <- 2
 	missing_cells <- which(is.na(NDATA))
 	DDATA[missing_cells] <- NDATA[missing_cells]
 	DDATA
 }
 #MajorFunction#This is the function that does everything else
 THEFT <- function(){
 	#Set working directory based on the directory of the series matrix file Currently only works for windows
 	wd <- getwd()
 	#list.files()
 	#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")
 	numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)
 	GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())
 	GSEfloc <- list.files()[GSEfileloc]
 	#ALL DATA FILES WILL BE CLEANED
 	if(numDAT == 1){
 		#indexing the data files
 		n <- 1
 		for(n in 1: length(GSEfloc)){
 			alz <- GSEfloc[n]
 			#Working with the wordy part of the document
 			alzword <- alz %>%
 				read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
 				filter(grepl("!Sample",X1))%>%
 				filter(!grepl("!Sample_contact",X1))
 			#Getting the GPL file
 			genena <- grep("_platform_id",alzword$X1) %>%
 				alzword$X2[.] %>%
 				str_trim(.) %>%
 				paste0("^",.,"\\D") %>%
 				grep(.,list.files()) %>%
 				list.files()[.]
 			#Find out if it is a soft GPL file or not
 			soft <- strsplit(genena,"[\\|/]") %>%
 				.[[1]] %>%
 				.[length(.)] %>%
 				grepl("soft",.)
 			##Changing row names and column names:
 			ALZWORD <- t(alzword)
 			rownames(ALZWORD)=NULL
 			colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 			ALZWORD <- chngrownm(ALZWORD)[-1,]
 			ALZWORD <- ALZWORD%>%
 				as.data.frame()%>%
 				dplyr::select(-starts_with("col"))
 			##Reorganizing information within the columns and final clinical data
 			ALZWORDF <- cinfo(ALZWORD)
 			#Working with Actual Data part of file
 			alzdat <- alz %>%
 				read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
 			ALZDAT <- t(alzdat[,-1])
 			rownames(ALZDAT)=NULL
 			##Is there a clean version of the GPL file available?
 			gplnum <- strsplit(genena,"[\\|/]") %>%
 				.[[1]] %>%
 				.[length(.)] %>%
 				gsub("\\D","",.)
 			clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
 			if(clfileex >= 1){
 			#use the clean version
 			geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
 				read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
 			}
 			else if(clfileex == 0){
 			##Lets Create a clean version
 			##Gene ID to Gene Name
 				if(soft == TRUE){
 					#Check to see if there is already a file containing information on soft files
 					fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
 					if(fileex == 1){
 						#Check to see if this GPL soft file has been used before
 						IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
 							.$GPL_FILE_NUM%>%
 							grepl(gplnum,.) %>%
 							sum()
 						if(IDF == 1){
 							IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
 								.$GPL_FILE_NUM%>%
 								grep(gplnum,.)
 							idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
 									.$LOC_ID %>%
 									.[IDLOCAL]
 							geneIDNam <- genena %>%
 								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
 						else if(IDF == 0){
 							#No information on this particular GPL file
 							idLOCGPL <- genena %>%
 								read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
 								t(.) %>%
 								grep("^ID\\s*$",.) %>%
 								-1
 							cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
 								cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
 							geneIDNam <- genena %>%
 								read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 								dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 						}
 					}
 					else if(fileex == 0){
 						#We must create a file that we can access for later use
 						idLOCGPL <- genena %>%
 							read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
 							t(.) %>%
 							grep("^ID\\s*$",.) %>%
 							-1
 						Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
 						colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
 						write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
 						geneIDNam <- genena %>%
 							read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 					}
 	 			}
 				else if(soft == FALSE){
 					geneIDNam <- genena %>%
 						read_delim(delim="\t",comment = "#")%>%
 						dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 				}
 				##Labeling the gene IDs without names
 				geneIDNam <- NAFIXING(geneIDNam)
 				##remove the whitespace
 				geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
 				##Here is the clean version
 				write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
 			}
 			##Changing the gene ID to gene name
 			ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
 			colnames(ALZDAT) = ALZDAT1[1,]
 			##Adjusting the column names aka the gene names
 			colnames(ALZDAT) <- gcnames(ALZDAT)
 			#Full RAW Data
 			Fullalzdwr <- ALZDAT %>%
 				as.data.frame() %>%
 				cbind(ALZWORDF,.)
 			#Raw file is output
 			nfnaex <- strsplit(alz,"[\\]") %>%
 				.[[1]] %>%
 				.[length(.)] %>%
 				gsub("\\D","",.) %>%
 				c("GSE",.,"aftexcel.txt") %>%
 				paste(collapse = "")
 			write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
 			#Now for the discretization part
 			##get the wordy part again
 			rawword <- t(ALZWORDF)
 			##where is ID_REF located
 			hereim <- grep("ID_REF",rownames(rawword))
 			##Subject Names GSM...
 			subjnam <- rawword[hereim,]
 			##Getting the names for the rows
 			namedarows <- rownames(rawword)[-hereim] %>%
 				as.data.frame()
 			RAWWORD <- rawword[-hereim,] %>%
 				as.data.frame() %>%
 				bind_cols(namedarows,.)
 			z <- 1
 			naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 			for(z in 1:dim(RAWWORD)[1]){
 				naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
 				z <- z + 1
 			}
 			colnames(naroww) <- "ROW_NAs"
 			RAWWORD <- bind_cols(RAWWORD,naroww)
 			roALZna <- t(ALZDAT) %>%
 				rownames(.) %>%
 				as.data.frame(.)
 			colnames(roALZna) <- "ID_REF"
 			RAWDAT <- t(ALZDAT) %>%
 				as.data.frame(.)
 			colnames(RAWDAT) <- NULL
 			rownames(RAWDAT) <- NULL
 			RAWDAT2 <- RAWDAT %>%
 				cbind(roALZna,.) %>%
 				dplyr::arrange(.,ID_REF)
 			##Editing the file for R processing
 			RAWDATID <- RAWDAT2[,1] %>%
 				as.matrix(.)
 			RAWDATNUM <- RAWDAT2[,-1] %>%
 				mapply(.,FUN = as.numeric) %>%
 				t(.)
 			##Consolidating genes with the same name
 			###create empty matrix of size equal  to tabRDATID
 			tabRDATID <- table(RAWDATID)
 			NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
 			j <- 1
 			for(j in 1:length(tabRDATID)){
 				##Putting the ones without duplicates in their new homes
 				if(tabRDATID[j] == 1){
 					NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
 				}
 				##Averaging duplicates and putting them in their new homes
 				else if(tabRDATID[j] > 1){
 					NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 				}
 				j <- j + 1
 			}
 			##Scaling the Data
 			scrawdat <- NuRDATN%>%
 				scale()
 			attr(scrawdat,"scaled:center") <- NULL
 			attr(scrawdat,"scaled:scale") <- NULL
 			colnames(scrawdat) <- rownames(tabRDATID)
 			##Discretized the Data
 			dialzdat <- scrawdat %>%
 				dndat(.) %>%
 				t()%>%
 				as.data.frame(.)
 			colnames(dialzdat) <- rownames(RAWDATNUM)
 			##setting "ID_REF" as a new variable
 			geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
 			colnames(geneNAM) <- "ID_REF"
 			rownames(dialzdat) <- NULL
 			dialzdat <-bind_cols(geneNAM,dialzdat)
 			##NAs in a column
 			x <- 2
 			nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
 			nacol[1,1] = "COL_NAs"
 			for(x in 2:dim(dialzdat)[2]){
 				nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
 				x <- x + 1
 			}
 			colnames(nacol) <- colnames(dialzdat)
 			dialzdat <- bind_rows(dialzdat,nacol)
 			##NAs in a row
 			y <- 1
 			narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
 			for(y in 1:dim(dialzdat)[1]){
 				narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
 				y <- y + 1
 			}
 			colnames(narowd) <- "ROW_NAs"
 			dialzdat <- bind_cols(dialzdat,narowd)
 			colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
 			colnames(RAWWORD) <- colnames(dialzdat)
 			##converting to character so that the clinical can be brought together with discrete data
 			k <- 2
 			for(k in 2:dim(dialzdat)[2]-1){
 				dialzdat[,k] <- as.character(dialzdat[,k])
 				k <- k + 1
 			}
 			#The End the full data
 			Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
 			#Produces Discrete file
 			nfnaex2 <- strsplit(alz,"[\\|/]") %>%
 				.[[1]] %>%
 				.[length(.)] %>%
 				gsub("\\D","",.) %>%
 				c("GSE",.,"dscrt.txt") %>%
 				paste(collapse = "")
 			write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
 			n <- n +1
 		}
 	}
 	#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
 	else if(numDAT == 2){
 		#All the files you want to analyze
 		ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")
 		if(length(ANDIS) == 0){
 			#Spit out a warning
 			warning("You did not select any files and so no cleaning will be performed")
 		} else{
 			#indexing the data files
 			n <- 1
 			for(n in 1: length(ANDIS)){
 				alz <- ANDIS[n]
 				#Working with the wordy part of the document
 				alzword <- alz %>%
 					read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
 					filter(grepl("!Sample",X1))%>%
 					filter(!grepl("!Sample_contact",X1))
 				#Getting the GPL file
 				genena <- grep("_platform_id",alzword$X1) %>%
 					alzword$X2[.] %>%
 					str_trim(.) %>%
 					paste0("^",.,"\\D") %>%
 					grep(.,list.files()) %>%
 					list.files()[.]
 				#Find out if it is a soft GPL file or not
 				soft <- strsplit(genena,"[\\|/]") %>%
 					.[[1]] %>%
 					.[length(.)] %>%
 					grepl("soft",.)
 				##Changing row names and column names:
 				ALZWORD <- t(alzword)
 				rownames(ALZWORD)=NULL
 				colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 				ALZWORD <- chngrownm(ALZWORD)[-1,]
 				ALZWORD <- ALZWORD%>%
 					as.data.frame()%>%
 					dplyr::select(-starts_with("col"))
 				##Reorganizing information within the columns and final clinical data
 				ALZWORDF <- cinfo(ALZWORD)
 				#Working with Actual Data part of file
 				alzdat <- alz %>%
 					read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
 				ALZDAT <- t(alzdat[,-1])
 				rownames(ALZDAT)=NULL
 				##Is there a clean version of the GPL file available?
 				##gplnum holds the digits found in the GPL file name (last path component)
 				gplparts <- strsplit(genena,"[\\|/]")[[1]]
 				gplnum <- gsub("\\D","",gplparts[length(gplparts)])
 				cleangpl <- paste0("Clean_GPL",gplnum,".txt")
 				##The exact file name is tested: a pattern match on "Clean_GPL<num>" would also
 				##accept the clean file of a platform whose number merely starts with gplnum
 				##and the read below would then fail on a file that does not exist
 				clfileex <- as.integer(file.exists(cleangpl))
 				if(clfileex >= 1){
 					#use the clean version
 					geneIDNam <- read_delim(cleangpl,delim="\t",col_names = c("ID","Symbol"), comment = "!")
 				} else{
 					##Lets Create a clean version
 					##Gene ID to Gene Name
 					if(soft == TRUE){
 						##GPL_ID_LOC.txt stores, per GPL number (GPL_FILE_NUM), the number of
 						##lines (LOC_ID) to skip before the table header of the soft file
 						locfile <- "GPL_ID_LOC.txt"
 						haveloc <- file.exists(locfile)
 						idLOCGPL <- NULL
 						if(haveloc){
 							#Check to see if this GPL soft file has been used before.
 							#The numbers are compared for equality; a substring match would
 							#confuse e.g. GPL570 with GPL5705 and pick up the wrong offset
 							gplloc <- read_delim(locfile,delim = "\t",col_names = TRUE)
 							IDLOCAL <- which(gplloc$GPL_FILE_NUM == as.integer(gplnum))
 							if(length(IDLOCAL) >= 1){
 								idLOCGPL <- gplloc$LOC_ID[IDLOCAL[1]]
 							}
 						}
 						if(is.null(idLOCGPL)){
 							#No information on this particular GPL file:
 							#look for the "ID" header cell within the first 1000 rows
 							gplhead <- read_delim(genena,delim="\t",col_names = FALSE, comment = "!", n_max = 1000)
 							idLOCGPL <- grep("^ID\\s*$",t(gplhead)) - 1
 							#Remember the location for later use
 							Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
 							if(haveloc){
 								cat(Firstval,file=locfile,sep = "\t", fill = TRUE, append = TRUE)
 							} else{
 								#We must create a file that we can access for later use
 								colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
 								write.table(Firstval,file = locfile, sep = "\t",row.names = FALSE, col.names = TRUE)
 							}
 						}
 						#Read the table itself and keep the ID column plus the gene name column(s)
 						geneIDNam <- genena %>%
 							read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 					}
 					else if(soft == FALSE){
 						geneIDNam <- genena %>%
 							read_delim(delim="\t",comment = "#")%>%
 							dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 					}
 					##Labeling the gene IDs without names
 					geneIDNam <- NAFIXING(geneIDNam)
 					##remove the whitespace
 					geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
 					##Here is the clean version
 					write.table(geneIDNam,file = cleangpl,sep = "\t",row.names = FALSE, col.names = FALSE)
 				}
 				##Changing the gene ID to gene name
 				ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
 				colnames(ALZDAT) <- ALZDAT1[1,]
 				##Adjusting the column names aka the gene names
 				colnames(ALZDAT) <- gcnames(ALZDAT)
 				#Full RAW Data: clinical columns first, expression columns after them
 				Fullalzdwr <- cbind(ALZWORDF,as.data.frame(ALZDAT))
 				#Raw file is output as "GSE<digits of the file name>aftexcel.txt".
 				#The path is split on "\", "|" and "/" (the same pattern that is used for
 				#the discrete output file) so that only the last path component
 				#contributes digits, whichever separator the path uses
 				alzparts <- strsplit(alz,"[\\|/]")[[1]]
 				nfnaex <- paste(c("GSE",gsub("\\D","",alzparts[length(alzparts)]),"aftexcel.txt"),collapse = "")
 				write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
 				#Now for the discretization part
 				##get the wordy part again (clinical variables as rows)
 				rawword <- t(ALZWORDF)
 				##where is ID_REF located
 				hereim <- grep("ID_REF",rownames(rawword))
 				##Subject Names GSM...
 				subjnam <- rawword[hereim,]
 				##Getting the names for the rows
 				namedarows <- rownames(rawword)[-hereim] %>%
 					as.data.frame()
 				##Row names become the first column, followed by the values of every row but ID_REF
 				RAWWORD <- rawword[-hereim,] %>%
 					as.data.frame() %>%
 					bind_cols(namedarows,.)
 				##NAs in a row: counted for all rows at once (also fine when there are no rows);
 				##kept as double, as before
 				naroww <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(RAWWORD))))
 				RAWWORD <- bind_cols(RAWWORD,naroww)
 				##The row names of the transposed data become the ID_REF column
 				roALZna <- as.data.frame(rownames(t(ALZDAT)))
 				colnames(roALZna) <- "ID_REF"
 				##Transposed data as a data frame, stripped of its row and column names
 				RAWDAT <- as.data.frame(t(ALZDAT))
 				colnames(RAWDAT) <- NULL
 				rownames(RAWDAT) <- NULL
 				##Put ID_REF in front and sort the rows by it
 				RAWDAT2 <- RAWDAT %>%
 					cbind(roALZna,.)
 				RAWDAT2 <- dplyr::arrange(RAWDAT2,ID_REF)
 				##Editing the file for R processing
 				###IDs as a one column matrix
 				RAWDATID <- as.matrix(RAWDAT2[,1])
 				###everything else converted to numeric and transposed
 				RAWDATNUM <- t(mapply(RAWDAT2[,-1],FUN = as.numeric))
 				##Consolidating genes with the same name
 				###create an empty matrix with the rows of RAWDATNUM and one column per distinct ID
 				tabRDATID <- table(RAWDATID)
 				NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
 				for(j in seq_along(tabRDATID)){
 					###columns of RAWDATNUM that carry the j-th distinct ID
 					samegene <- which(RAWDATID==rownames(tabRDATID)[j])
 					##Putting the ones without duplicates in their new homes
 					if(tabRDATID[j] == 1){
 						NuRDATN[,j] <- RAWDATNUM[,samegene]
 					}
 					##Averaging duplicates and putting them in their new homes
 					else if(tabRDATID[j] > 1){
 						###drop = FALSE keeps a matrix even when RAWDATNUM has a single row,
 						###otherwise rowMeans() would be handed a plain vector and fail
 						NuRDATN[,j] <- rowMeans(RAWDATNUM[,samegene,drop = FALSE],na.rm = TRUE)
 					}
 				}
 				##Scaling the Data; the attributes that scale() attaches are removed again
 				scrawdat <- scale(NuRDATN)
 				attr(scrawdat,"scaled:center") <- NULL
 				attr(scrawdat,"scaled:scale") <- NULL
 				colnames(scrawdat) <- rownames(tabRDATID)
 				##Discretized the Data: dndat() is applied and its result transposed
 				dialzdat <- as.data.frame(t(dndat(scrawdat)))
 				colnames(dialzdat) <- rownames(RAWDATNUM)
 				##setting "ID_REF" as a new variable built from the row names
 				geneNAM <- as.data.frame(as.matrix(rownames(dialzdat)))
 				colnames(geneNAM) <- "ID_REF"
 				rownames(dialzdat) <- NULL
 				dialzdat <- bind_cols(geneNAM,dialzdat)
 				##NAs in a column
 				###one row holding the NA count of every column (kept as double);
 				###its first cell is replaced by the label "COL_NAs"
 				nacol <- as.data.frame(t(unname(colSums(is.na(dialzdat)))),stringsAsFactors = FALSE)
 				nacol[1,1] <- "COL_NAs"
 				colnames(nacol) <- colnames(dialzdat)
 				dialzdat <- bind_rows(dialzdat,nacol)
 				##NAs in a row
 				###counted after the COL_NAs row was appended, so that row gets a count as well
 				narowd <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat))))
 				dialzdat <- bind_cols(dialzdat,narowd)
 				##Columns 2 .. (last - 1) are labelled with the subject names,
 				##the clinical part then takes over the very same column names
 				lastsubj <- dim(dialzdat)[2] - 1
 				colnames(dialzdat)[2:lastsubj] <- subjnam
 				colnames(RAWWORD) <- colnames(dialzdat)
 				##converting to character so that the clinical can be brought together with discrete data
 				##NOTE: the former range "2:dim(dialzdat)[2]-1" is parsed as (2:n)-1, i.e. 1 .. n-1,
 				##so the first column (ID_REF) is converted as well and only the last column is
 				##left alone. That range is kept, it is just written out explicitly here.
 				for(k in seq_len(lastsubj)){
 					dialzdat[,k] <- as.character(dialzdat[,k])
 				}
 				#The full data: clinical rows on top, discretized rows below
 				Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
 				#Produces Discrete file named "GSE<digits of the file name>dscrt.txt"
 				alzparts <- strsplit(alz,"[\\|/]")[[1]]
 				alzdigits <- gsub("\\D","",alzparts[length(alzparts)])
 				nfnaex2 <- paste(c("GSE",alzdigits,"dscrt.txt"),collapse = "")
 				write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)
 				n <- n + 1
 			}
 		}
 	}
 }