# Efrain Gonzalez / Cleaning and Fixing Data with R

1

2

#Efrain H. Gonzalez

2

#Efrain H. Gonzalez

3

#6/22/2017

3

#6/22/2017

4

options(digits = 11)

4

options(digits = 11)

5

#Libraries required to run the code

5

#Libraries required to run the code

6

library(pryr)

6

library(pryr)

7

library(MASS)

7

library(MASS)

8

library(dplyr)

8

library(dplyr)

9

library(tidyr)

9

library(tidyr)

10

library(readr)

10

library(readr)

11

library(stringr)

11

library(stringr)

12

13

14

#Necessary Functions

14

#Necessary Functions

15

#1#Function for handling the changing of row names and column names
# Renames the columns of a transposed block of GEO "!Sample_*" rows.
#
# Args:
#   mat: matrix whose first row holds the "!Sample_*" field name of each
#        column and whose second row holds the first value of that column.
#        If `mat` has no column names, the defaults "col1", "col2", ... are
#        assigned first so that columns can be renamed by position.
#
# Returns:
#   `mat` with updated column names. Cell contents are not modified.
#
# Renaming rules:
#   * A column whose row-1 value is one of the three fixed field names below
#     gets the matching fixed name ("Brain_Region", "Title", "ID_REF").
#   * Any other column is named from keywords found in its row-2 value, with
#     a running number per label ("Sex1", "Sex2", ...).
#   * The keyword rules are independent checks applied in the order listed.
#     When several match, the LAST matching rule decides the final name, but
#     the counter of every matching rule is still advanced. For example
#     "disease stage: AD" matches "age" (inside "stage") and "disease", ends
#     up as "Group<n>", and also uses up one "Age" number.
#   * Columns that match nothing keep their existing name.
chngrownm <- function(mat) {
  # Nothing to inspect without a header row or without columns.
  if (nrow(mat) < 1L || ncol(mat) < 1L) {
    return(mat)
  }

  # colnames(mat)[e] <- ... needs a full-length name vector to edit.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  # Row-1 field names that map straight to a fixed column name.
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # label -> regular expression searched for in the row-2 value.
  # Order matters: a later match overwrites an earlier one.
  keyword_rules <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )

  # Next running number to hand out for each label.
  next_index <- rep(1L, length(keyword_rules))
  names(next_index) <- names(keyword_rules)

  has_value_row <- nrow(mat) >= 2L

  for (e in seq_len(ncol(mat))) {
    header <- as.character(mat[1, e])

    # %in% is FALSE for NA, so a missing header falls through to the
    # keyword rules instead of stopping with an error.
    if (header %in% names(fixed_names)) {
      colnames(mat)[e] <- fixed_names[[header]]
      next
    }

    if (!has_value_row) {
      next
    }

    value <- mat[2, e]
    for (label in names(keyword_rules)) {
      if (isTRUE(grepl(keyword_rules[[label]], value))) {
        colnames(mat)[e] <- paste0(label, next_index[[label]])
        next_index[[label]] <- next_index[[label]] + 1L
      }
    }
  }

  mat
}

59

60

#2#Function for reorganizing information within the columns
# Cleans the values of the clinical columns according to the column name.
#
# Args:
#   mat: data frame (or matrix) with column names. Columns whose names contain
#        "Group", "Age", "Sex", "PMI" or "Braak" are rewritten; the first
#        match in that order decides the treatment of a column.
#
# Returns:
#   `mat` with the matching columns replaced:
#     Group -> text with the leading "<label>: " removed (and, where present,
#              a trailing " ...;..." section removed)
#     Age   -> integer built from the digits in the value
#     Sex   -> text with the leading "<label>: " removed
#     PMI   -> numeric built from the digits and dots in the value
#     Braak -> roman numeral after the "<label>: " prefix, as integer
#
# The first column is never rewritten; processing starts at column 2.
cinfo <- function(mat) {
  field_names <- colnames(mat)

  # seq_along(...)[-1] is empty for fewer than two columns, so the first
  # column stays untouched. 2:col would count down to 1 in that case.
  for (j in seq_along(field_names)[-1]) {
    field <- field_names[j]
    raw <- mat[, j]

    if (isTRUE(grepl("Group", field))) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", raw)
    } else if (isTRUE(grepl("Age", field))) {
      # NOTE(review): every non-digit is dropped, so a value such as
      # "age: 80.5" becomes 805 - confirm ages are always whole numbers.
      mat[, j] <- as.integer(gsub("\\D", "", raw))
    } else if (isTRUE(grepl("Sex", field))) {
      mat[, j] <- gsub(".+:\\s", "", raw)
    } else if (isTRUE(grepl("PMI", field))) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", raw))
    } else if (isTRUE(grepl("Braak", field))) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", raw)))
    }
  }

  mat
}

84

85

#3#Function for labeling the gene IDs without names
# Fills in missing gene names with the gene ID from the same row.
#
# Args:
#   GIDNAM: data frame (or matrix) whose first column holds the IDs and whose
#           second column holds the names.
#
# Returns:
#   `GIDNAM` where every entry of column 2 that is a real NA, or the text
#   "NA" optionally followed by whitespace, has been replaced by the value of
#   column 1 in that row. All other cells are unchanged.
NAFIXING <- function(GIDNAM) {
  is_frame <- is.data.frame(GIDNAM)

  # Pull both columns out as plain vectors so the test and the replacement
  # run once over the whole column rather than once per row.
  ids <- if (is_frame) GIDNAM[[1]] else GIDNAM[, 1]
  labels <- if (is_frame) GIDNAM[[2]] else GIDNAM[, 2]

  # grepl() is FALSE for NA, so real NAs are picked up by is.na().
  unnamed <- is.na(labels) | grepl("^NA\\s*$", labels)
  labels[unnamed] <- ids[unnamed]

  if (is_frame) {
    GIDNAM[[2]] <- labels
  } else {
    GIDNAM[, 2] <- labels
  }

  GIDNAM
}

97

98

#4#Function for changing the gene ID to gene name
# Replaces the IDs in the first column of DATA by the matching gene names.
#
# Args:
#   GeneName: two-column table; column 1 holds the gene IDs, column 2 the
#             gene names they translate to.
#   DATA:     table whose first column holds the gene IDs to translate.
#
# Returns:
#   t(DATA) as a matrix whose first row (the former ID column) has each ID
#   that occurs in GeneName replaced by its gene name. IDs that do not occur
#   in GeneName, and missing IDs, are left as they are. If an ID occurs more
#   than once in GeneName, the first occurrence is used.
#
# IDs are compared as exact strings. They are not pasted into a regular
# expression, so IDs containing characters such as ".", "(", "[" or "+" are
# matched literally, and the lookup does not rescan GeneName for every ID.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName)
  translated <- t(DATA)

  if (nrow(translated) < 1L || ncol(translated) < 1L || nrow(lookup) < 2L) {
    return(translated)
  }

  ids <- translated[1, ]
  # Position of each ID in the lookup table (NA when absent).
  position <- match(ids, lookup[1, ])
  # match() pairs NA with NA; a missing ID must not be translated.
  found <- !is.na(position) & !is.na(ids)

  translated[1, found] <- lookup[2, position[found]]
  translated
}

117

#cgeneID <- function(GeneName,DATA){

117

#cgeneID <- function(GeneName,DATA){

118

# colGene <- dim(GeneName)[2]

118

# colGene <- dim(GeneName)[2]

119

# j <- 1

119

# j <- 1

120

# for(j in 1:colGene){

120

# for(j in 1:colGene){

121

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

121

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

122

# if(is.na(sum(chngsreq))==FALSE){

122

# if(is.na(sum(chngsreq))==FALSE){

123

# if(sum(chngsreq) > 0){

123

# if(sum(chngsreq) > 0){

124

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

124

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

125

# }

125

# }

126

# }

126

# }

127

# j = j+1

127

# j = j+1

128

# }

128

# }

129

# DATA

129

# DATA

130

#}

130

#}

131

132

#5#Function for adjusting the gene names
# Picks one entry out of column names that list several entries separated
# by "///" or "//".
#
# Args:
#   DiData: matrix or data frame with column names.
#   usecol: position of the entry to keep (default 1). When a column name has
#           fewer than `usecol` entries, its first entry is used instead.
#
# Returns:
#   Character vector with one element per column of DiData: the selected
#   entry with surrounding whitespace removed. An empty column name yields
#   NA. A DiData without columns yields character(0).
#
# str_trim() comes from stringr, which this file loads with library().
gcnames <- function(DiData, usecol = 1) {
  labels <- colnames(DiData)

  if (is.null(labels)) {
    if (isTRUE(dim(DiData)[2] > 0)) {
      stop("DiData must have column names", call. = FALSE)
    }
    return(character(0))
  }
  if (length(labels) == 0L) {
    return(character(0))
  }

  # Split every name once; each list element holds the entries of one name.
  pieces <- strsplit(labels, "///|//")

  picked <- vapply(
    pieces,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1)
  )

  str_trim(picked)
}

148

149

#6# Function for discretizing the data
# Turns a numeric matrix into three levels.
#
# Args:
#   NDATA: numeric matrix (or data frame of numeric columns).
#
# Returns:
#   Numeric matrix with the dimensions and column names of NDATA where
#     values below -1         become 0,
#     values from -1 to 1     become 1,
#     values above 1          become 2,
#     missing values (NA/NaN) are carried over unchanged.
#   Row names are not carried over.
dndat <- function(NDATA) {
  scores <- as.matrix(NDATA)

  # Start with the middle level everywhere, then overwrite the two tails.
  DDATA <- matrix(1, nrow = nrow(scores), ncol = ncol(scores))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are skipped here.
  DDATA[which(scores < -1)] <- 0
  DDATA[which(scores > 1)] <- 2

  missing <- is.na(scores)
  if (any(missing)) {
    DDATA[missing] <- scores[missing]
  }

  DDATA
}

177

178

179

#MajorFunction#This is the function that does everything else

179

#MajorFunction#This is the function that does everything else

180

THEFT <- function(){

180

THEFT <- function(){

181

#Set working directory based on the directory of the series matrix file Currently only works for windows

181

#Set working directory based on the directory of the series matrix file Currently only works for windows

182

wd <- getwd()

182

wd <- getwd()

183

#list.files()

183

#list.files()

184

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

184

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

185

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

185

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

186

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

186

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

187

GSEfloc <- list.files()[GSEfileloc]

187

GSEfloc <- list.files()[GSEfileloc]

188

#ALL DATA FILES WILL BE CLEANED

188

#ALL DATA FILES WILL BE CLEANED

189

if(numDAT == 1){

189

if(numDAT == 1){

190

#indexing the data files

190

#indexing the data files

191

n <- 1

191

n <- 1

192

for(n in 1: length(GSEfloc)){

192

for(n in 1: length(GSEfloc)){

193

alz <- GSEfloc[n]

193

alz <- GSEfloc[n]

194

195

#Working with the wordy part of the document

195

#Working with the wordy part of the document

196

alzword <- alz %>%

196

alzword <- alz %>%

197

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

197

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

198

filter(grepl("!Sample",X1))%>%

198

filter(grepl("!Sample",X1))%>%

199

filter(!grepl("!Sample_contact",X1))

199

filter(!grepl("!Sample_contact",X1))

200

201

#Getting the GPL file

201

#Getting the GPL file

202

genena <- grep("_platform_id",alzword$X1) %>%

202

genena <- grep("_platform_id",alzword$X1) %>%

203

alzword$X2[.] %>%

203

alzword$X2[.] %>%

204

str_trim(.) %>%

204

str_trim(.) %>%

205

paste0("^",.,"\\D") %>%

205

paste0("^",.,"\\D") %>%

206

grep(.,list.files()) %>%

206

grep(.,list.files()) %>%

207

list.files()[.]

207

list.files()[.]

208

209

#Find out if it is a soft GPL file or not

209

#Find out if it is a soft GPL file or not

210

soft <- strsplit(genena,"[\\|/]") %>%

210

soft <- strsplit(genena,"[\\|/]") %>%

211

.[[1]] %>%

211

.[[1]] %>%

212

.[length(.)] %>%

212

.[length(.)] %>%

213

grepl("soft",.)

213

grepl("soft",.)

214

215

##Changing row names and column names:

215

##Changing row names and column names:

216

ALZWORD <- t(alzword)

216

ALZWORD <- t(alzword)

217

rownames(ALZWORD)=NULL

217

rownames(ALZWORD)=NULL

218

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

218

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

219

ALZWORD <- chngrownm(ALZWORD)[-1,]

219

ALZWORD <- chngrownm(ALZWORD)[-1,]

220

ALZWORD <- ALZWORD%>%

220

ALZWORD <- ALZWORD%>%

221

as.data.frame(.,stringsAsFactors = FALSE)%>%

221

as.data.frame(.,stringsAsFactors = FALSE)%>%

222

dplyr::select(-starts_with("col"))

222

dplyr::select(-starts_with("col"))

223

224

##Reorganizing information within the columns and final clinical data

224

##Reorganizing information within the columns and final clinical data

225

ALZWORDF <- cinfo(ALZWORD)

225

ALZWORDF <- cinfo(ALZWORD)

226

227

228

#Working with Actual Data part of file

228

#Working with Actual Data part of file

229

alzdat <- alz %>%

229

alzdat <- alz %>%

230

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

230

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

231

ALZDAT <- t(alzdat[,-1])

231

ALZDAT <- t(alzdat[,-1])

232

rownames(ALZDAT)=NULL

232

rownames(ALZDAT)=NULL

233

234

##Is there a clean version of the GPL file available?

234

##Is there a clean version of the GPL file available?

235

gplnum <- strsplit(genena,"[\\|/]") %>%

235

gplnum <- strsplit(genena,"[\\|/]") %>%

236

.[[1]] %>%

236

.[[1]] %>%

237

.[length(.)] %>%

237

.[length(.)] %>%

238

gsub("\\D","",.)

238

gsub("\\D","",.)

239

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

239

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

240

if(clfileex >= 1){

240

if(clfileex >= 1){

241

#use the clean version

241

#use the clean version

242

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

242

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

243

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

243

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

244

245

} else if(clfileex == 0){

245

} else if(clfileex == 0){

246

##Lets Create a clean version

246

##Lets Create a clean version

247

248

##Gene ID to Gene Name

248

##Gene ID to Gene Name

249

if(soft == TRUE){

249

if(soft == TRUE){

250

#Check to see if there is already a file containing information on soft files

250

#Check to see if there is already a file containing information on soft files

251

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

251

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

252

if(fileex == 1){

252

if(fileex == 1){

253

#Check to see if this GPL soft file has been used before

253

#Check to see if this GPL soft file has been used before

254

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

254

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

255

.$GPL_FILE_NUM%>%

255

.$GPL_FILE_NUM%>%

256

grepl(gplnum,.) %>%

256

grepl(gplnum,.) %>%

257

sum()

257

sum()

258

if(IDF == 1){

258

if(IDF == 1){

259

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

259

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

260

.$GPL_FILE_NUM%>%

260

.$GPL_FILE_NUM%>%

261

grep(gplnum,.)

261

grep(gplnum,.)

262

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

262

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

263

.$LOC_ID %>%

263

.$LOC_ID %>%

264

.[IDLOCAL]

264

.[IDLOCAL]

265

geneIDNam <- genena %>%

265

geneIDNam <- genena %>%

266

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

266

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

267

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

267

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

268

} else if(IDF == 0){

268

} else if(IDF == 0){

269

#No information on this particular GPL file

269

#No information on this particular GPL file

270

idLOCGPL <- genena %>%

270

idLOCGPL <- genena %>%

271

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

271

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

272

t(.) %>%

272

t(.) %>%

273

grep("^ID\\s*$",.) %>%

273

grep("^ID\\s*$",.) %>%

274

-1

274

-1

275

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

275

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

276

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

276

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

277

geneIDNam <- genena %>%

277

geneIDNam <- genena %>%

278

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

278

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

279

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

279

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

280

}

280

}

281

} else if(fileex == 0){

281

} else if(fileex == 0){

282

#We must create a file that we can access for later use

282

#We must create a file that we can access for later use

283

idLOCGPL <- genena %>%

283

idLOCGPL <- genena %>%

284

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

284

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

285

t(.) %>%

285

t(.) %>%

286

grep("^ID\\s*$",.) %>%

286

grep("^ID\\s*$",.) %>%

287

-1

287

-1

288

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

288

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

289

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

289

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

290

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

290

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

291

geneIDNam <- genena %>%

291

geneIDNam <- genena %>%

292

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

292

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

293

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

293

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

294

}

294

}

295

} else if(soft == FALSE){

295

} else if(soft == FALSE){

296

geneIDNam <- genena %>%

296

geneIDNam <- genena %>%

297

read_delim(delim="\t",comment = "#")%>%

297

read_delim(delim="\t",comment = "#")%>%

298

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

298

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

299

}

299

}

300

301

##Labeling the gene IDs without names

301

##Labeling the gene IDs without names

302

geneIDNam <- NAFIXING(geneIDNam)

302

geneIDNam <- NAFIXING(geneIDNam)

303

304

##remove the whitespace

304

##remove the whitespace

305

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

305

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

306

307

##Here is the clean version

307

##Here is the clean version

308

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

308

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

309

}

309

}

310

311

312

313

##Changing the gene ID to gene name

313

##Changing the gene ID to gene name

314

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

314

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

315

colnames(ALZDAT) = ALZDAT1[1,]

315

colnames(ALZDAT) = ALZDAT1[1,]

316

317

318

##Adjusting the column names aka the gene names

318

##Adjusting the column names aka the gene names

319

colnames(ALZDAT) <- gcnames(ALZDAT)

319

colnames(ALZDAT) <- gcnames(ALZDAT)

320

321

322

#Full RAW Data

322

#Full RAW Data

323

Fullalzdwr <- ALZDAT %>%

323

Fullalzdwr <- ALZDAT %>%

324

as.data.frame(.,stringsAsFactors = FALSE) %>%

324

as.data.frame(.,stringsAsFactors = FALSE) %>%

325

cbind(ALZWORDF,.)

325

cbind(ALZWORDF,.)

326

327

#Raw file is output

327

#Raw file is output

328

nfnaex <- strsplit(alz,"[\\]") %>%

328

nfnaex <- strsplit(alz,"[\\]") %>%

329

.[[1]] %>%

329

.[[1]] %>%

330

.[length(.)] %>%

330

.[length(.)] %>%

331

gsub("\\D","",.) %>%

331

gsub("\\D","",.) %>%

332

c("GSE",.,"aftexcel.txt") %>%

332

c("GSE",.,"aftexcel.txt") %>%

333

paste(collapse = "")

333

paste(collapse = "")

334

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

334

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

335

336

337

338

#Now for the discretization part

338

#Now for the discretization part

339

##get the wordy part again

339

##get the wordy part again

340

rawword <- t(ALZWORDF)

340

rawword <- t(ALZWORDF)

341

342

##where is ID_REF located

342

##where is ID_REF located

343

hereim <- grep("ID_REF",rownames(rawword))

343

hereim <- grep("ID_REF",rownames(rawword))

344

345

##Subject Names GSM...

345

##Subject Names GSM...

346

subjnam <- rawword[hereim,]

346

subjnam <- rawword[hereim,]

347

348

##Getting the names for the rows

348

##Getting the names for the rows

349

namedarows <- rownames(rawword)[-hereim] %>%

349

namedarows <- rownames(rawword)[-hereim] %>%

350

as.data.frame(.,stringsAsFactors = FALSE)

350

as.data.frame(.,stringsAsFactors = FALSE)

351

RAWWORD <- rawword[-hereim,] %>%

351

RAWWORD <- rawword[-hereim,] %>%

352

as.data.frame(.,stringsAsFactors = FALSE) %>%

352

as.data.frame(.,stringsAsFactors = FALSE) %>%

353

bind_cols(namedarows,.)

353

bind_cols(namedarows,.)

354

z <- 1

354

z <- 1

355

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

355

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

356

for(z in 1:dim(RAWWORD)[1]){

356

for(z in 1:dim(RAWWORD)[1]){

357

if(sum(is.na(RAWWORD[z,])) > 0){

357

if(sum(is.na(RAWWORD[z,])) > 0){

358

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

358

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

359

}

359

}

360

if(length(grep("NA",RAWWORD[z,])) > 0){

360

if(length(grep("NA",RAWWORD[z,])) > 0){

361

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

361

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

362

}

362

}

363

z <- z + 1

363

z <- z + 1

364

}

364

}

365

366

colnames(naroww) <- "ROW_NAs"

366

colnames(naroww) <- "ROW_NAs"

367

RAWWORD <- bind_cols(RAWWORD,naroww)

367

RAWWORD <- bind_cols(RAWWORD,naroww)

368

369

370

roALZna <- t(ALZDAT) %>%

370

roALZna <- t(ALZDAT) %>%

371

rownames(.) %>%

371

rownames(.) %>%

372

as.data.frame(.,stringsAsFactors = FALSE)

372

as.data.frame(.,stringsAsFactors = FALSE)

373

colnames(roALZna) <- "ID_REF"

373

colnames(roALZna) <- "ID_REF"

374

375

RAWDAT <- t(ALZDAT) %>%

375

RAWDAT <- t(ALZDAT) %>%

376

as.data.frame(.,stringsAsFactors = FALSE)

376

as.data.frame(.,stringsAsFactors = FALSE)

377

colnames(RAWDAT) <- NULL

377

colnames(RAWDAT) <- NULL

378

rownames(RAWDAT) <- NULL

378

rownames(RAWDAT) <- NULL

379

380

RAWDAT2 <- RAWDAT %>%

380

RAWDAT2 <- RAWDAT %>%

381

cbind(roALZna,.) %>%

381

cbind(roALZna,.) %>%

382

dplyr::arrange(.,ID_REF)

382

dplyr::arrange(.,ID_REF)

383

384

##Editing the file for R processing

384

##Editing the file for R processing

385

RAWDATID <- RAWDAT2[,1] %>%

385

RAWDATID <- RAWDAT2[,1] %>%

386

as.matrix(.)

386

as.matrix(.)

387

388

RAWDATNUM <- RAWDAT2[,-1] %>%

388

RAWDATNUM <- RAWDAT2[,-1] %>%

389

mapply(.,FUN = as.numeric) %>%

389

mapply(.,FUN = as.numeric) %>%

390

t(.)

390

t(.)

391

392

##Consolidating genes with the same name

392

##Consolidating genes with the same name

393

###create empty matrix of size equal to tabRDATID

393

###create empty matrix of size equal to tabRDATID

394

tabRDATID <- table(RAWDATID)

394

tabRDATID <- table(RAWDATID)

395

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

395

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

396

j <- 1

396

j <- 1

397

for(j in 1:length(tabRDATID)){

397

for(j in 1:length(tabRDATID)){

398

##Putting the ones without duplicates in their new homes

398

##Putting the ones without duplicates in their new homes

399

if(tabRDATID[j] == 1){

399

if(tabRDATID[j] == 1){

400

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

400

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

401

} else if(tabRDATID[j] > 1){

401

} else if(tabRDATID[j] > 1){

402

##Averaging duplicates and putting them in their new homes

402

##Averaging duplicates and putting them in their new homes

403

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

403

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

404

}

404

}

405

j <- j + 1

405

j <- j + 1

406

}

406

}

407

408

##Scaling the Data

408

##Scaling the Data

409

scrawdat <- NuRDATN%>%

409

scrawdat <- NuRDATN%>%

410

scale()

410

scale()

411

attr(scrawdat,"scaled:center") <- NULL

411

attr(scrawdat,"scaled:center") <- NULL

412

attr(scrawdat,"scaled:scale") <- NULL

412

attr(scrawdat,"scaled:scale") <- NULL

413

colnames(scrawdat) <- rownames(tabRDATID)

413

colnames(scrawdat) <- rownames(tabRDATID)

414

415

#Outputting the Z-score file

416

nfnzsc <- strsplit(alz,"[\\]") %>%

417

.[[1]] %>%

418

.[length(.)] %>%

419

gsub("\\D","",.) %>%

420

c("GSE",.,"zscore.txt") %>%

421

paste(collapse = "")

422

zscraw <- scrawdat %>%

423

t()%>%

424

as.data.frame(.,stringsAsFactors = FALSE)

425

colnames(zscraw) <- subjnam

426

write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)

427

415

##Discretized the Data

428

##Discretized the Data

416

dialzdat <- scrawdat %>%

429

dialzdat <- scrawdat %>%

417

dndat(.) %>%

430

dndat(.) %>%

418

t()%>%

431

t()%>%

419

as.data.frame(.,stringsAsFactors = FALSE)

432

as.data.frame(.,stringsAsFactors = FALSE)

420

colnames(dialzdat) <- rownames(RAWDATNUM)

433

colnames(dialzdat) <- rownames(RAWDATNUM)

421

434

422

##setting "ID_REF" as a new variable

435

##setting "ID_REF" as a new variable

423

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

436

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

424

colnames(geneNAM) <- "ID_REF"

437

colnames(geneNAM) <- "ID_REF"

425

rownames(dialzdat) <- NULL

438

rownames(dialzdat) <- NULL

426

dialzdat <-bind_cols(geneNAM,dialzdat)

439

dialzdat <-bind_cols(geneNAM,dialzdat)

427

440

428

##NAs in a column

441

##NAs in a column

429

x <- 2

442

x <- 2

430

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

443

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

431

nacol[1,1] = "COL_NAs"

444

nacol[1,1] = "COL_NAs"

432

for(x in 2:dim(dialzdat)[2]){

445

for(x in 2:dim(dialzdat)[2]){

433

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

446

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

434

x <- x + 1

447

x <- x + 1

435

}

448

}

436

colnames(nacol) <- colnames(dialzdat)

449

colnames(nacol) <- colnames(dialzdat)

437

dialzdat <- bind_rows(dialzdat,nacol)

450

dialzdat <- bind_rows(dialzdat,nacol)

438

451

439

##NAs in a row

452

##NAs in a row

440

y <- 1

453

y <- 1

441

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

454

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

442

for(y in 1:dim(dialzdat)[1]){

455

for(y in 1:dim(dialzdat)[1]){

443

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

456

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

444

y <- y + 1

457

y <- y + 1

445

}

458

}

446

colnames(narowd) <- "ROW_NAs"

459

colnames(narowd) <- "ROW_NAs"

447

dialzdat <- bind_cols(dialzdat,narowd)

460

dialzdat <- bind_cols(dialzdat,narowd)

448

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

461

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

449

colnames(RAWWORD) <- colnames(dialzdat)

462

colnames(RAWWORD) <- colnames(dialzdat)

450

##converting to character so that the clinical can be brought together with discrete data

463

##converting to character so that the clinical can be brought together with discrete data

451

k <- 2

464

k <- 2

452

for(k in 2:dim(dialzdat)[2]-1){

465

for(k in 2:dim(dialzdat)[2]-1){

453

dialzdat[,k] <- as.character(dialzdat[,k])

466

dialzdat[,k] <- as.character(dialzdat[,k])

454

k <- k + 1

467

k <- k + 1

455

}

468

}

456

#The End the full data

469

#The End the full data

457

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

470

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

458

471

459

#Produces Discrete file

472

#Produces Discrete file

460

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

473

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

461

.[[1]] %>%

474

.[[1]] %>%

462

.[length(.)] %>%

475

.[length(.)] %>%

463

gsub("\\D","",.) %>%

476

gsub("\\D","",.) %>%

464

c("GSE",.,"dscrt.txt") %>%

477

c("GSE",.,"dscrt.txt") %>%

465

paste(collapse = "")

478

paste(collapse = "")

466

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

479

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

467

n <- n +1

480

n <- n +1

468

}

481

}

469

} else if(numDAT == 2){

482

} else if(numDAT == 2){

470

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

483

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

471

484

472

#All the files you want to analyze

485

#All the files you want to analyze

473

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

486

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

474

if(length(ANDIS) == 0){

487

if(length(ANDIS) == 0){

475

#Spit out a warning

488

#Spit out a warning

476

warning("You did not select any files and so no cleaning will be performed")

489

warning("You did not select any files and so no cleaning will be performed")

477

} else{

490

} else{

478

#indexing the data files

491

#indexing the data files

479

n <- 1

492

n <- 1

480

for(n in 1: length(ANDIS)){

493

for(n in 1: length(ANDIS)){

481

alz <- ANDIS[n]

494

alz <- ANDIS[n]

482

495

483

#Working with the wordy part of the document

496

#Working with the wordy part of the document

484

alzword <- alz %>%

497

alzword <- alz %>%

485

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

498

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

486

filter(grepl("!Sample",X1))%>%

499

filter(grepl("!Sample",X1))%>%

487

filter(!grepl("!Sample_contact",X1))

500

filter(!grepl("!Sample_contact",X1))

488

501

489

#Getting the GPL file

502

#Getting the GPL file

490

genena <- grep("_platform_id",alzword$X1) %>%

503

genena <- grep("_platform_id",alzword$X1) %>%

491

alzword$X2[.] %>%

504

alzword$X2[.] %>%

492

str_trim(.) %>%

505

str_trim(.) %>%

493

paste0("^",.,"\\D") %>%

506

paste0("^",.,"\\D") %>%

494

grep(.,list.files()) %>%

507

grep(.,list.files()) %>%

495

list.files()[.]

508

list.files()[.]

496

509

497

#Find out if it is a soft GPL file or not

510

#Find out if it is a soft GPL file or not

498

soft <- strsplit(genena,"[\\|/]") %>%

511

soft <- strsplit(genena,"[\\|/]") %>%

499

.[[1]] %>%

512

.[[1]] %>%

500

.[length(.)] %>%

513

.[length(.)] %>%

501

grepl("soft",.)

514

grepl("soft",.)

502

515

503

##Changing row names and column names:

516

##Changing row names and column names:

504

ALZWORD <- t(alzword)

517

ALZWORD <- t(alzword)

505

rownames(ALZWORD)=NULL

518

rownames(ALZWORD)=NULL

506

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

519

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

507

ALZWORD <- chngrownm(ALZWORD)[-1,]

520

ALZWORD <- chngrownm(ALZWORD)[-1,]

508

ALZWORD <- ALZWORD%>%

521

ALZWORD <- ALZWORD%>%

509

as.data.frame(.,stringsAsFactors = FALSE)%>%

522

as.data.frame(.,stringsAsFactors = FALSE)%>%

510

dplyr::select(-starts_with("col"))

523

dplyr::select(-starts_with("col"))

511

524

512

##Reorganizing information within the columns and final clinical data

525

##Reorganizing information within the columns and final clinical data

513

ALZWORDF <- cinfo(ALZWORD)

526

ALZWORDF <- cinfo(ALZWORD)

514

527

515

528

516

#Working with Actual Data part of file

529

#Working with Actual Data part of file

517

alzdat <- alz %>%

530

alzdat <- alz %>%

518

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

531

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

519

ALZDAT <- t(alzdat[,-1])

532

ALZDAT <- t(alzdat[,-1])

520

rownames(ALZDAT)=NULL

533

rownames(ALZDAT)=NULL

521

534

522

##Is there a clean version of the GPL file available?

535

##Is there a clean version of the GPL file available?

523

gplnum <- strsplit(genena,"[\\|/]") %>%

536

gplnum <- strsplit(genena,"[\\|/]") %>%

524

.[[1]] %>%

537

.[[1]] %>%

525

.[length(.)] %>%

538

.[length(.)] %>%

526

gsub("\\D","",.)

539

gsub("\\D","",.)

527

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

540

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

528

if(clfileex >= 1){

541

if(clfileex >= 1){

529

#use the clean version

542

#use the clean version

530

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

543

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

531

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

544

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

532

545

533

} else if(clfileex == 0){

546

} else if(clfileex == 0){

534

##Lets Create a clean version

547

##Lets Create a clean version

535

548

536

##Gene ID to Gene Name

549

##Gene ID to Gene Name

537

if(soft == TRUE){

550

if(soft == TRUE){

538

#Check to see if there is already a file containing information on soft files

551

#Check to see if there is already a file containing information on soft files

539

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

552

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

540

if(fileex == 1){

553

if(fileex == 1){

541

#Check to see if this GPL soft file has been used before

554

#Check to see if this GPL soft file has been used before

542

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

555

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

543

.$GPL_FILE_NUM%>%

556

.$GPL_FILE_NUM%>%

544

grepl(gplnum,.) %>%

557

grepl(gplnum,.) %>%

545

sum()

558

sum()

546

if(IDF == 1){

559

if(IDF == 1){

547

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

560

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

548

.$GPL_FILE_NUM%>%

561

.$GPL_FILE_NUM%>%

549

grep(gplnum,.)

562

grep(gplnum,.)

550

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

563

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

551

.$LOC_ID %>%

564

.$LOC_ID %>%

552

.[IDLOCAL]

565

.[IDLOCAL]

553

geneIDNam <- genena %>%

566

geneIDNam <- genena %>%

554

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

567

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

555

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

568

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

556

} else if(IDF == 0){

569

} else if(IDF == 0){

557

#No information on this particular GPL file

570

#No information on this particular GPL file

558

idLOCGPL <- genena %>%

571

idLOCGPL <- genena %>%

559

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

572

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

560

t(.) %>%

573

t(.) %>%

561

grep("^ID\\s*$",.) %>%

574

grep("^ID\\s*$",.) %>%

562

-1

575

-1

563

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

576

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

564

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

577

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

565

geneIDNam <- genena %>%

578

geneIDNam <- genena %>%

566

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

579

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

567

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

580

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

568

}

581

}

569

} else if(fileex == 0){

582

} else if(fileex == 0){

570

#We must create a file that we can access for later use

583

#We must create a file that we can access for later use

571

idLOCGPL <- genena %>%

584

idLOCGPL <- genena %>%

572

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

585

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

573

t(.) %>%

586

t(.) %>%

574

grep("^ID\\s*$",.) %>%

587

grep("^ID\\s*$",.) %>%

575

-1

588

-1

576

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

589

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

577

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

590

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

578

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

591

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

579

geneIDNam <- genena %>%

592

geneIDNam <- genena %>%

580

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

593

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

581

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

594

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

582

}

595

}

583

} else if(soft == FALSE){

596

} else if(soft == FALSE){

584

geneIDNam <- genena %>%

597

geneIDNam <- genena %>%

585

read_delim(delim="\t",comment = "#")%>%

598

read_delim(delim="\t",comment = "#")%>%

586

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

599

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(.)))

587

}

600

}

588

601

589

##Labeling the gene IDs without names

602

##Labeling the gene IDs without names

590

geneIDNam <- NAFIXING(geneIDNam)

603

geneIDNam <- NAFIXING(geneIDNam)

591

604

592

##remove the whitespace

605

##remove the whitespace

593

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

606

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

594

607

595

##Here is the clean version

608

##Here is the clean version

596

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

609

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

597

}

610

}

598

611

599

612

600

613

601

##Changing the gene ID to gene name

614

##Changing the gene ID to gene name

602

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

615

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

603

colnames(ALZDAT) = ALZDAT1[1,]

616

colnames(ALZDAT) = ALZDAT1[1,]

604

617

605

618

606

##Adjusting the column names aka the gene names

619

##Adjusting the column names aka the gene names

607

colnames(ALZDAT) <- gcnames(ALZDAT)

620

colnames(ALZDAT) <- gcnames(ALZDAT)

608

621

609

622

610

#Full RAW Data

623

#Full RAW Data

611

Fullalzdwr <- ALZDAT %>%

624

Fullalzdwr <- ALZDAT %>%

612

as.data.frame(.,stringsAsFactors = FALSE) %>%

625

as.data.frame(.,stringsAsFactors = FALSE) %>%

613

cbind(ALZWORDF,.)

626

cbind(ALZWORDF,.)

614

627

615

#Raw file is output

628

#Raw file is output

616

nfnaex <- strsplit(alz,"[\\]") %>%

629

nfnaex <- strsplit(alz,"[\\]") %>%

617

.[[1]] %>%

630

.[[1]] %>%

618

.[length(.)] %>%

631

.[length(.)] %>%

619

gsub("\\D","",.) %>%

632

gsub("\\D","",.) %>%

620

c("GSE",.,"aftexcel.txt") %>%

633

c("GSE",.,"aftexcel.txt") %>%

621

paste(collapse = "")

634

paste(collapse = "")

622

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

635

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

623

636

624

637

625

638

626

#Now for the discretization part

639

#Now for the discretization part

627

##get the wordy part again

640

##get the wordy part again

628

rawword <- t(ALZWORDF)

641

rawword <- t(ALZWORDF)

629

642

630

##where is ID_REF located

643

##where is ID_REF located

631

hereim <- grep("ID_REF",rownames(rawword))

644

hereim <- grep("ID_REF",rownames(rawword))

632

645

633

##Subject Names GSM...

646

##Subject Names GSM...

634

subjnam <- rawword[hereim,]

647

subjnam <- rawword[hereim,]

635

648

636

##Getting the names for the rows

649

##Getting the names for the rows

637

namedarows <- rownames(rawword)[-hereim] %>%

650

namedarows <- rownames(rawword)[-hereim] %>%

638

as.data.frame(.,stringsAsFactors = FALSE)

651

as.data.frame(.,stringsAsFactors = FALSE)

639

RAWWORD <- rawword[-hereim,] %>%

652

RAWWORD <- rawword[-hereim,] %>%

640

as.data.frame(.,stringsAsFactors = FALSE) %>%

653

as.data.frame(.,stringsAsFactors = FALSE) %>%

641

bind_cols(namedarows,.)

654

bind_cols(namedarows,.)

642

z <- 1

655

z <- 1

643

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

656

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

644

for(z in 1:dim(RAWWORD)[1]){

657

for(z in 1:dim(RAWWORD)[1]){

645

if(sum(is.na(RAWWORD[z,])) > 0){

658

if(sum(is.na(RAWWORD[z,])) > 0){

646

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

659

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

647

}

660

}

648

if(length(grep("NA",RAWWORD[z,])) > 0){

661

if(length(grep("NA",RAWWORD[z,])) > 0){

649

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

662

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

650

}

663

}

651

z <- z + 1

664

z <- z + 1

652

}

665

}

653

666

654

colnames(naroww) <- "ROW_NAs"

667

colnames(naroww) <- "ROW_NAs"

655

RAWWORD <- bind_cols(RAWWORD,naroww)

668

RAWWORD <- bind_cols(RAWWORD,naroww)

656

669

657

670

658

roALZna <- t(ALZDAT) %>%

671

roALZna <- t(ALZDAT) %>%

659

rownames(.) %>%

672

rownames(.) %>%

660

as.data.frame(.,stringsAsFactors = FALSE)

673

as.data.frame(.,stringsAsFactors = FALSE)

661

colnames(roALZna) <- "ID_REF"

674

colnames(roALZna) <- "ID_REF"

662

675

663

RAWDAT <- t(ALZDAT) %>%

676

RAWDAT <- t(ALZDAT) %>%

664

as.data.frame(.,stringsAsFactors = FALSE)

677

as.data.frame(.,stringsAsFactors = FALSE)

665

colnames(RAWDAT) <- NULL

678

colnames(RAWDAT) <- NULL

666

rownames(RAWDAT) <- NULL

679

rownames(RAWDAT) <- NULL

667

680

668

RAWDAT2 <- RAWDAT %>%

681

RAWDAT2 <- RAWDAT %>%

669

cbind(roALZna,.) %>%

682

cbind(roALZna,.) %>%

670

dplyr::arrange(.,ID_REF)

683

dplyr::arrange(.,ID_REF)

671

684

672

##Editing the file for R processing

685

##Editing the file for R processing

673

RAWDATID <- RAWDAT2[,1] %>%

686

RAWDATID <- RAWDAT2[,1] %>%

674

as.matrix(.)

687

as.matrix(.)

675

688

676

RAWDATNUM <- RAWDAT2[,-1] %>%

689

RAWDATNUM <- RAWDAT2[,-1] %>%

677

mapply(.,FUN = as.numeric) %>%

690

mapply(.,FUN = as.numeric) %>%

678

t(.)

691

t(.)

679

692

680

##Consolidating genes with the same name

693

##Consolidating genes with the same name

681

###create empty matrix of size equal to tabRDATID

694

###create empty matrix of size equal to tabRDATID

682

tabRDATID <- table(RAWDATID)

695

tabRDATID <- table(RAWDATID)

683

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

696

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

684

j <- 1

697

j <- 1

685

for(j in 1:length(tabRDATID)){

698

for(j in 1:length(tabRDATID)){

686

##Putting the ones without duplicates in their new homes

699

##Putting the ones without duplicates in their new homes

687

if(tabRDATID[j] == 1){

700

if(tabRDATID[j] == 1){

688

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

701

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

689

} else if(tabRDATID[j] > 1){

702

} else if(tabRDATID[j] > 1){

690

##Averaging duplicates and putting them in their new homes

703

##Averaging duplicates and putting them in their new homes

691

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

704

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

692

}

705

}

693

j <- j + 1

706

j <- j + 1

694

}

707

}

695

708

696

##Scaling the Data

709

##Scaling the Data

697

scrawdat <- NuRDATN%>%

710

scrawdat <- NuRDATN%>%

698

scale()

711

scale()

699

attr(scrawdat,"scaled:center") <- NULL

712

attr(scrawdat,"scaled:center") <- NULL

700

attr(scrawdat,"scaled:scale") <- NULL

713

attr(scrawdat,"scaled:scale") <- NULL

701

colnames(scrawdat) <- rownames(tabRDATID)

714

colnames(scrawdat) <- rownames(tabRDATID)

702

715

716

#Outputting the Z-score file

717

nfnzsc <- strsplit(alz,"[\\]") %>%

718

.[[1]] %>%

719

.[length(.)] %>%

720

gsub("\\D","",.) %>%

721

c("GSE",.,"zscore.txt") %>%

722

paste(collapse = "")

723

zscraw <- scrawdat %>%

724

t()%>%

725

as.data.frame(.,stringsAsFactors = FALSE)

726

colnames(zscraw) <- subjnam

727

write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = TRUE)

728

703

##Discretized the Data

729

##Discretized the Data

704

dialzdat <- scrawdat %>%

730

dialzdat <- scrawdat %>%

705

dndat(.) %>%

731

dndat(.) %>%

706

t()%>%

732

t()%>%

707

as.data.frame(.,stringsAsFactors = FALSE)

733

as.data.frame(.,stringsAsFactors = FALSE)

708

colnames(dialzdat) <- rownames(RAWDATNUM)

734

colnames(dialzdat) <- rownames(RAWDATNUM)

709

735

710

##setting "ID_REF" as a new variable

736

##setting "ID_REF" as a new variable

711

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

737

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

712

colnames(geneNAM) <- "ID_REF"

738

colnames(geneNAM) <- "ID_REF"

713

rownames(dialzdat) <- NULL

739

rownames(dialzdat) <- NULL

714

dialzdat <-bind_cols(geneNAM,dialzdat)

740

dialzdat <-bind_cols(geneNAM,dialzdat)

715

741

716

##NAs in a column

742

##NAs in a column

717

x <- 2

743

x <- 2

718

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

744

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

719

nacol[1,1] = "COL_NAs"

745

nacol[1,1] = "COL_NAs"

720

for(x in 2:dim(dialzdat)[2]){

746

for(x in 2:dim(dialzdat)[2]){

721

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

747

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

722

x <- x + 1

748

x <- x + 1

723

}

749

}

724

colnames(nacol) <- colnames(dialzdat)

750

colnames(nacol) <- colnames(dialzdat)

725

dialzdat <- bind_rows(dialzdat,nacol)

751

dialzdat <- bind_rows(dialzdat,nacol)

726

752

727

##NAs in a row

753

##NAs in a row

728

y <- 1

754

y <- 1

729

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

755

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

730

for(y in 1:dim(dialzdat)[1]){

756

for(y in 1:dim(dialzdat)[1]){

731

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

757

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

732

y <- y + 1

758

y <- y + 1

733

}

759

}

734

colnames(narowd) <- "ROW_NAs"

760

colnames(narowd) <- "ROW_NAs"

735

dialzdat <- bind_cols(dialzdat,narowd)

761

dialzdat <- bind_cols(dialzdat,narowd)

736

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

762

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

737

colnames(RAWWORD) <- colnames(dialzdat)

763

colnames(RAWWORD) <- colnames(dialzdat)

738

##converting to character so that the clinical can be brought together with discrete data

764

##converting to character so that the clinical can be brought together with discrete data

739

k <- 2

765

k <- 2

740

for(k in 2:dim(dialzdat)[2]-1){

766

for(k in 2:dim(dialzdat)[2]-1){

741

dialzdat[,k] <- as.character(dialzdat[,k])

767

dialzdat[,k] <- as.character(dialzdat[,k])

742

k <- k + 1

768

k <- k + 1

743

}

769

}

744

#The End the full data

770

#The End the full data

745

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

771

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

746

772

747

#Produces Discrete file

773

#Produces Discrete file

748

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

774

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

749

.[[1]] %>%

775

.[[1]] %>%

750

.[length(.)] %>%

776

.[length(.)] %>%

751

gsub("\\D","",.) %>%

777

gsub("\\D","",.) %>%

752

c("GSE",.,"dscrt.txt") %>%

778

c("GSE",.,"dscrt.txt") %>%

753

paste(collapse = "")

779

paste(collapse = "")

754

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

780

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

755

781

756

782

757

n <- n + 1

783

n <- n + 1

758

}

784

}

759

}

785

}

760

}

786

}

761

}

787

}

762

#The Rest of this code will be used every time you want to change a data set

788

#The Rest of this code will be used every time you want to change a data set

763

THEFT()

789

THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Updated: now also outputs the z-score file

 #Efrain H. Gonzalez
 #6/22/2017
 #Use up to 11 significant digits when numbers are printed or formatted
 options(digits = 11)
 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 #Renames the columns of a transposed sample-annotation matrix.
 #  mat: matrix (or data frame) that already has column names. Row 1 holds the
 #       field names (e.g. "!Sample_title") and row 2 holds the first value of
 #       each field.
 #Returns mat with new column names:
 #  - the three fixed fields become "Brain_Region", "Title" and "ID_REF";
 #  - any other column is named after the keyword found in row 2, followed by a
 #    running number ("Sex1", "PMI1", "Age1", "Braak1", "Group1", ...).
 #  Columns matching nothing keep their current name.
 #The keyword patterns are tried in a fixed order and every match consumes a
 #number; when several patterns match the same column the last one wins
 #(e.g. "braak stage" matches both "age" and "braak" and ends up as Braak).
 chngrownm <- function(mat){
 	#Field names that map to one fixed column name
 	fixed <- c("!Sample_source_name_ch1" = "Brain_Region",
 		"!Sample_title" = "Title",
 		"!Sample_geo_accession" = "ID_REF")
 	#Keyword patterns, in the order in which they are tried
 	patterns <- c(Sex = "Sex|gender|Gender|sex",
 		PMI = "postmorteminterval|PMI|pmi|interval",
 		Age = "age|Age|AGE",
 		Braak = "braak|b&b",
 		Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal")
 	#One running counter per keyword
 	counters <- rep(1L,length(patterns))
 	names(counters) <- names(patterns)
 	#seq_len() keeps the loop from running on a matrix without columns
 	for(e in seq_len(dim(mat)[2])){
 		field <- as.character(mat[1,e])
 		#An NA field name is treated as "not a fixed field" instead of crashing the if()
 		if(!is.na(field) && field %in% names(fixed)){
 			colnames(mat)[e] <- fixed[[field]]
 		} else{
 			for(lab in names(patterns)){
 				if(grepl(patterns[[lab]],mat[2,e])){
 					colnames(mat)[e] <- paste0(lab,counters[[lab]])
 					counters[[lab]] <- counters[[lab]] + 1L
 				}
 			}
 		}
 	}
 	mat
 }
 #2#Function for reorganizing information within the columns
 #Cleans the clinical columns according to their column name.
 #  mat: data frame (or matrix) whose column names were set by chngrownm().
 #Returns mat with columns 2..n rewritten as follows (column 1 is never touched):
 #  Group*: text up to the first ": " removed, as well as any " ...;..." tail
 #  Age*:   every non-digit removed, converted to integer
 #  Sex*:   text up to the last ": " removed
 #  PMI*:   everything except digits and "." removed, converted to numeric
 #  Braak*: text up to the last ": " removed, read as a roman numeral and
 #          converted to integer
 #  Other columns are left unchanged.
 cinfo <- function(mat){
 	col <- dim(mat)[2]
 	#With fewer than two columns 2:col would count downwards and rewrite
 	#column 1, so return early instead
 	if(col < 2){
 		return(mat)
 	}
 	for(j in 2:col){
 		nm <- colnames(mat)[j]
 		if(grepl("Group",nm)){
 			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
 		} else if(grepl("Age",nm)){
 			mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
 		} else if(grepl("Sex",nm)){
 			mat[,j] <- gsub(".+:\\s","",mat[,j])
 		} else if(grepl("PMI",nm)){
 			mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
 		} else if(grepl("Braak",nm)){
 			mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
 		}
 	}
 	mat
 }
 #3#Function for labeling the gene IDs without names
 #Fills in missing gene names with the gene ID of the same row.
 #  GIDNAM: data frame, tibble or matrix; column 1 holds the IDs and column 2
 #          the names.
 #Returns GIDNAM where every entry of column 2 that is NA, or that consists of
 #the text "NA" followed only by white space, has been replaced by column 1.
 #The work is done on whole columns, so an input without rows is returned
 #unchanged instead of failing in a 1:0 loop.
 NAFIXING <- function(GIDNAM){
 	isdf <- is.data.frame(GIDNAM)
 	#[[ ]] extracts a plain vector from data frames and tibbles alike
 	ids <- if(isdf) GIDNAM[[1]] else GIDNAM[,1]
 	nams <- if(isdf) GIDNAM[[2]] else GIDNAM[,2]
 	unnamed <- is.na(nams) | grepl("^NA\\s*$",nams)
 	nams[unnamed] <- ids[unnamed]
 	if(isdf){
 		GIDNAM[[2]] <- nams
 	} else{
 		GIDNAM[,2] <- nams
 	}
 	GIDNAM
 }
 #4#Function for changing the gene ID to gene name
 #Replaces the gene IDs of the expression table by their gene names.
 #  GeneName: two-column table/matrix; column 1 = ID, column 2 = name.
 #  DATA:     expression table whose first column holds the gene IDs.
 #Returns t(DATA) (a matrix) in which every entry of row 1 that is found in the
 #ID column of GeneName has been replaced by the corresponding name; IDs that
 #are not found stay as they are.
 #IDs are compared as exact strings with match(). The earlier version pasted
 #each ID into a regular expression, so characters such as "." or "+" inside an
 #ID could hit the wrong annotation row (or none), and every ID caused a full
 #scan of the annotation table.
 cgeneID <- function(GeneName,DATA){
 	nj <- t(GeneName)
 	nq <- t(DATA)
 	#Position of each data ID inside the annotation IDs (first hit); NA = absent.
 	#incomparables = NA keeps a missing ID from pairing with a missing annotation ID.
 	hit <- match(nq[1,],nj[1,],incomparables = NA)
 	found <- !is.na(hit)
 	nq[1,found] <- nj[2,hit[found]]
 	nq
 }
 #cgeneID <- function(GeneName,DATA){
 #    colGene <- dim(GeneName)[2]
 #     j <- 1
 #     for(j in 1:colGene){
 #	chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])
 #	if(is.na(sum(chngsreq))==FALSE){
 #		if(sum(chngsreq) > 0){
 #			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
 #		}
 #	}
 #	j = j+1
 #	}
 #	DATA
 #}
 #5#Function for adjusting the gene names
 #Picks one gene name per column out of names that list several alternatives.
 #  DiData: matrix or data frame whose column names may contain several names
 #          separated by "///" or "//".
 #  usecol: which alternative to keep (default 1, the first one).
 #Returns a character vector with one trimmed name per column. A column that
 #has fewer than usecol alternatives falls back to its first alternative.
 #Each column name is split only once, and an input without columns yields
 #character(0) instead of failing in a 1:0 loop.
 gcnames <- function(DiData,usecol=1){
 	if(dim(DiData)[2] == 0){
 		return(character(0))
 	}
 	parts <- strsplit(colnames(DiData),"///|//")
 	picked <- vapply(parts,function(p){
 		if(length(p) >= usecol) p[usecol] else p[1]
 	},character(1))
 	#trimws() strips leading and trailing blanks, tabs and line breaks
 	trimws(picked)
 }
 #6# Function for discretizing the data
 #Turns a numeric matrix of scaled values into three classes.
 #  NDATA: numeric matrix.
 #Returns a numeric matrix of the same size that carries the column names of
 #NDATA (row names are not copied):
 #  value < -1        -> 0
 #  -1 <= value <= 1  -> 1
 #  value > 1         -> 2
 #  missing value     -> kept as it is
 #The classes are assigned with logical masks over the whole matrix, which also
 #makes an input without rows or columns work.
 dndat <- function(NDATA){
 	#Start with the middle class everywhere, then overwrite the other cases
 	DDATA <- matrix(1,nrow=dim(NDATA)[1],ncol=dim(NDATA)[2])
 	colnames(DDATA) <- colnames(NDATA)
 	isna <- is.na(NDATA)
 	#!isna keeps the masks free of NA so they can be used as an index
 	DDATA[!isna & NDATA < -1] <- 0
 	DDATA[!isna & NDATA > 1] <- 2
 	DDATA[isna] <- NDATA[isna]
 	DDATA
 }
 #Helper for THEFT: builds an output file name of the form "GSE<digits><suffix>".
 #  alz:    name (or path) of the series matrix file
 #  suffix: text appended after the digits, e.g. "zscore.txt"
 #Only the last path component is used and every non-digit in it is dropped.
 #The same separators ("\", "|" and "/") are used for all output files.
 theftoutname <- function(alz,suffix){
 	pieces <- strsplit(alz,"[\\|/]")[[1]]
 	paste0("GSE",gsub("\\D","",pieces[length(pieces)]),suffix)
 }
 #Helper for THEFT: returns the gene ID / gene name table of one platform.
 #  genena: name of the GPL file found in the working directory
 #  soft:   TRUE when the name of that file contains "soft"
 #  gplnum: the digits of the GPL file name, as a string
 #If "Clean_GPL<gplnum>.txt" exists it is read and returned (a tibble with the
 #columns ID and Symbol). Otherwise the GPL file is read, unnamed genes are
 #labelled with NAFIXING(), white space is trimmed, the result is written to
 #"Clean_GPL<gplnum>.txt" and returned as a two-column character matrix.
 #For soft files the number of lines to skip is cached in "GPL_ID_LOC.txt".
 theftgenemap <- function(genena,soft,gplnum){
 	cleanfile <- paste0("Clean_GPL",gplnum,".txt")
 	#file.exists() tests for exactly this file; a substring test over the
 	#directory listing would also accept e.g. Clean_GPL570.txt for GPL57
 	if(file.exists(cleanfile)){
 		#use the clean version
 		return(read_delim(cleanfile,delim="\t",col_names = c("ID","Symbol"), comment = "!"))
 	}
 	##Lets Create a clean version
 	#Keeps the ID column and every column that looks like a gene symbol column
 	pickcols <- function(tbl){
 		dplyr::select(tbl,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$",colnames(tbl)))
 	}
 	if(soft){
 		#Looks for the "ID" header cell within the first 1000 parsed rows; the
 		#result is used as the skip value when the table is read
 		findskip <- function(){
 			top <- read_delim(genena,delim="\t",col_names = FALSE, comment = "!", n_max = 1000)
 			grep("^ID\\s*$",t(top)) - 1
 		}
 		if(file.exists("GPL_ID_LOC.txt")){
 			#Check to see if this GPL soft file has been used before; the platform
 			#number is compared as a whole number, not as a substring
 			known <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE)
 			hit <- which(known$GPL_FILE_NUM == as.integer(gplnum))
 			if(length(hit) >= 1){
 				skipn <- known$LOC_ID[hit[1]]
 			} else{
 				#No information on this particular GPL file
 				skipn <- findskip()
 				cat(cbind(as.integer(gplnum),as.integer(skipn)),file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
 			}
 		} else{
 			#We must create a file that we can access for later use
 			skipn <- findskip()
 			Firstval <- cbind(as.integer(gplnum),as.integer(skipn))
 			colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
 			write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
 		}
 		geneIDNam <- pickcols(read_delim(genena,delim="\t",col_names = TRUE, comment = "!", skip = skipn))
 	} else{
 		geneIDNam <- pickcols(read_delim(genena,delim="\t",comment = "#"))
 	}
 	##Labeling the gene IDs without names
 	geneIDNam <- NAFIXING(geneIDNam)
 	##remove the whitespace
 	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
 	##Here is the clean version
 	write.table(geneIDNam,file = cleanfile,sep = "\t",row.names = FALSE, col.names = FALSE)
 	geneIDNam
 }
 #Helper for THEFT: cleans one series matrix file.
 #  alz: name of a "GSE...txt.gz" file in the working directory
 #Writes three tab separated files next to it:
 #  GSE<digits>aftexcel.txt  clinical columns plus raw expression values
 #  GSE<digits>zscore.txt    scaled values, genes in rows, subjects in columns
 #  GSE<digits>dscrt.txt     clinical rows plus discretized values and NA counts
 #Stops with an error when no GPL file for the platform is in the directory.
 theftcleanone <- function(alz){
 	#Working with the wordy part of the document
 	alzword <- alz %>%
 		read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
 		dplyr::filter(grepl("!Sample",X1))%>%
 		dplyr::filter(!grepl("!Sample_contact",X1))
 	#Getting the GPL file
 	genena <- grep("_platform_id",alzword$X1) %>%
 		alzword$X2[.] %>%
 		str_trim(.) %>%
 		paste0("^",.,"\\D") %>%
 		grep(.,list.files()) %>%
 		list.files()[.]
 	if(length(genena) == 0){
 		stop("No GPL platform file was found in the working directory for ",alz,call. = FALSE)
 	}
 	#Last path component of the (first) GPL file name
 	gplfile <- strsplit(genena,"[\\|/]")[[1]]
 	gplfile <- gplfile[length(gplfile)]
 	#Find out if it is a soft GPL file or not
 	soft <- grepl("soft",gplfile)
 	gplnum <- gsub("\\D","",gplfile)
 	##Changing row names and column names:
 	ALZWORD <- t(alzword)
 	rownames(ALZWORD)=NULL
 	colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 	ALZWORD <- chngrownm(ALZWORD)[-1,]
 	ALZWORD <- ALZWORD%>%
 		as.data.frame(.,stringsAsFactors = FALSE)%>%
 		dplyr::select(-starts_with("col"))
 	##Reorganizing information within the columns and final clinical data
 	ALZWORDF <- cinfo(ALZWORD)
 	#Working with Actual Data part of file
 	alzdat <- alz %>%
 		read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)
 	ALZDAT <- t(alzdat[,-1])
 	rownames(ALZDAT)=NULL
 	##Gene ID to Gene Name table (clean version is reused or created)
 	geneIDNam <- theftgenemap(genena,soft,gplnum)
 	##Changing the gene ID to gene name
 	ALZDAT1 <- cgeneID(geneIDNam,alzdat)
 	colnames(ALZDAT) = ALZDAT1[1,]
 	##Adjusting the column names aka the gene names
 	colnames(ALZDAT) <- gcnames(ALZDAT)
 	#Full RAW Data
 	Fullalzdwr <- ALZDAT %>%
 		as.data.frame(.,stringsAsFactors = FALSE) %>%
 		cbind(ALZWORDF,.)
 	#Raw file is output
 	write.table(t(Fullalzdwr), file = theftoutname(alz,"aftexcel.txt"), sep = "\t")
 	#Now for the discretization part
 	##get the wordy part again
 	rawword <- t(ALZWORDF)
 	##where is ID_REF located
 	hereim <- grep("ID_REF",rownames(rawword))
 	##Subject Names GSM...
 	subjnam <- rawword[hereim,]
 	##Getting the names for the rows
 	namedarows <- rownames(rawword)[-hereim] %>%
 		as.data.frame(.,stringsAsFactors = FALSE)
 	RAWWORD <- rawword[-hereim,] %>%
 		as.data.frame(.,stringsAsFactors = FALSE) %>%
 		bind_cols(namedarows,.)
 	##NAs in each clinical row: missing cells plus cells matched by "NA"
 	naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 	for(z in seq_len(dim(RAWWORD)[1])){
 		if(sum(is.na(RAWWORD[z,])) > 0){
 			naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
 		}
 		if(length(grep("NA",RAWWORD[z,])) > 0){
 			naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
 		}
 	}
 	colnames(naroww) <- "ROW_NAs"
 	RAWWORD <- bind_cols(RAWWORD,naroww)
 	roALZna <- t(ALZDAT) %>%
 		rownames(.) %>%
 		as.data.frame(.,stringsAsFactors = FALSE)
 	colnames(roALZna) <- "ID_REF"
 	RAWDAT <- t(ALZDAT) %>%
 		as.data.frame(.,stringsAsFactors = FALSE)
 	colnames(RAWDAT) <- NULL
 	rownames(RAWDAT) <- NULL
 	RAWDAT2 <- RAWDAT %>%
 		cbind(roALZna,.) %>%
 		dplyr::arrange(.,ID_REF)
 	##Editing the file for R processing
 	RAWDATID <- RAWDAT2[,1] %>%
 		as.matrix(.)
 	RAWDATNUM <- RAWDAT2[,-1] %>%
 		mapply(.,FUN = as.numeric) %>%
 		t(.)
 	##Consolidating genes with the same name
 	###create empty matrix of size equal  to tabRDATID
 	tabRDATID <- table(RAWDATID)
 	NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
 	for(j in seq_along(tabRDATID)){
 		##Putting the ones without duplicates in their new homes
 		if(tabRDATID[j] == 1){
 			NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
 		} else if(tabRDATID[j] > 1){
 			##Averaging duplicates and putting them in their new homes
 			NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
 		}
 	}
 	##Scaling the Data
 	scrawdat <- NuRDATN%>%
 		scale()
 	attr(scrawdat,"scaled:center") <- NULL
 	attr(scrawdat,"scaled:scale") <- NULL
 	colnames(scrawdat) <- rownames(tabRDATID)
 	#Outputting the Z-score file
 	zscraw <- scrawdat %>%
 		t()%>%
 		as.data.frame(.,stringsAsFactors = FALSE)
 	colnames(zscraw) <- subjnam
 	write.table(zscraw, file = theftoutname(alz,"zscore.txt"), sep = "\t",col.names = TRUE,row.names = TRUE)
 	##Discretized the Data
 	dialzdat <- scrawdat %>%
 		dndat(.) %>%
 		t()%>%
 		as.data.frame(.,stringsAsFactors = FALSE)
 	colnames(dialzdat) <- rownames(RAWDATNUM)
 	##setting "ID_REF" as a new variable
 	geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)
 	colnames(geneNAM) <- "ID_REF"
 	rownames(dialzdat) <- NULL
 	dialzdat <-bind_cols(geneNAM,dialzdat)
 	##NAs in a column
 	nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
 	nacol[1,1] = "COL_NAs"
 	for(x in seq_len(dim(dialzdat)[2])[-1]){
 		nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
 	}
 	colnames(nacol) <- colnames(dialzdat)
 	dialzdat <- bind_rows(dialzdat,nacol)
 	##NAs in a row
 	narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
 	for(y in seq_len(dim(dialzdat)[1])){
 		narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
 	}
 	colnames(narowd) <- "ROW_NAs"
 	dialzdat <- bind_cols(dialzdat,narowd)
 	colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
 	colnames(RAWWORD) <- colnames(dialzdat)
 	##converting to character so that the clinical can be brought together with discrete data
 	##(columns 2..n-1; "2:n-1" used to mean 1:(n-1), which only added the
 	##already-character ID_REF column)
 	for(k in seq_len(dim(dialzdat)[2]-1)[-1]){
 		dialzdat[,k] <- as.character(dialzdat[,k])
 	}
 	#The End the full data
 	Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
 	#Produces Discrete file
 	write.table(Dscrtalzdw, file = theftoutname(alz,"dscrt.txt"), sep = "\t",col.names = TRUE,row.names = FALSE)
 	invisible(NULL)
 }
 #MajorFunction#This is the function that does everything else
 #Interactive entry point. Asks whether every "GSE...txt.gz" file in the current
 #working directory should be cleaned ("Yes") or only a selection ("No"), and
 #then runs the cleaning on each chosen file (see theftcleanone() for the files
 #that are written).
 #Leaving the menu without a choice prints "Nothing done" and returns; this
 #used to fail because the choice was empty when it was tested.
 #Takes no arguments and returns NULL invisibly.
 THEFT <- function(){
 	#Files are looked for in the current working directory
 	wd <- getwd()
 	choice <- menu(choices = c("Yes","No"),title = paste0("Do you want to clean all data files in the directory ",wd,"?"))
 	if(choice == 0){
 		cat("Nothing done\n")
 		return(invisible(NULL))
 	}
 	GSEfloc <- grep("^GSE.+\\.txt\\.gz$",list.files(),value = TRUE)
 	if(choice == 1){
 		#ALL DATA FILES WILL BE CLEANED
 		tocln <- GSEfloc
 		if(length(tocln) == 0){
 			warning("No GSE series matrix files (GSE*.txt.gz) were found in ",wd)
 		}
 	} else{
 		#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN
 		tocln <- select.list(choices = GSEfloc,multiple = TRUE, title = "Choose the file/files you want to analyze:")
 		if(length(tocln) == 0){
 			#Spit out a warning
 			warning("You did not select any files and so no cleaning will be performed")
 		}
 	}
 	#An empty selection simply skips the loop
 	for(alz in tocln){
 		theftcleanone(alz)
 	}
 	invisible(NULL)
 }
 #Entry point: run this each time you want to clean a data set.
 #Calling THEFT() starts the interactive cleaning workflow.
 THEFT()