Efrain Gonzalez / Cleaning and Fixing Data with R

1

########################################################################

1

########################################################################

2

# Don't Use This Code Just Yet #

2

# Don't Use This Code Just Yet #

3

########################################################################

3

########################################################################

4

#Efrain H. Gonzalez

4

#Efrain H. Gonzalez

5

#6/16/2017

5

#6/16/2017

6

7

#Libraries required to run the code

7

#Libraries required to run the code

8

library(pryr)

8

library(pryr)

9

library(MASS)

9

library(MASS)

10

library(dplyr)

10

library(dplyr)

11

library(tidyr)

11

library(tidyr)

12

library(readr)

12

library(readr)

13

library(stringr)

13

library(stringr)

14

15

16

#Necessary Functions

16

#Necessary Functions

17

#1#Function for handling the changing of row names and column names

17

#1#Function for handling the changing of row names and column names

18

# Rename the columns of a transposed GEO "!Sample_" metadata matrix.
#
# mat: matrix whose first row holds the GEO field labels (e.g. "!Sample_title")
#      and whose second row holds the first sample's values.
#
# Columns whose label is one of three known GEO fields get a fixed name.
# Every other column is classified by keyword-matching its row-2 value and is
# named "<Kind><n>", where n counts how many columns have matched that kind so
# far. Columns that match nothing keep their existing name.
#
# Returns `mat` with only its column names changed.
chngrownm <- function(mat) {
  # Column-name assignment by position needs names to exist already.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Order matters: the rules are tried one after another and a later match
  # overrides an earlier one (e.g. "braak stage" matches "age" first and is
  # then renamed to Braak). The earlier rule's counter is still advanced.
  value_rules <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  next_index <- rep(1L, length(value_rules))
  names(next_index) <- names(value_rules)

  # seq_len() keeps the loop from running on a zero-column input.
  for (e in seq_len(dim(mat)[2])) {
    label <- as.character(mat[1, e])

    # %in% is FALSE for NA, so a missing label no longer aborts the loop.
    if (label %in% names(fixed_names)) {
      colnames(mat)[e] <- fixed_names[[label]]
      next
    }

    first_value <- mat[2, e]
    for (kind in names(value_rules)) {
      if (grepl(value_rules[[kind]], first_value)) {
        colnames(mat)[e] <- paste0(kind, next_index[[kind]])
        next_index[[kind]] <- next_index[[kind]] + 1L
      }
    }
  }

  mat
}

63

64

#2#Function for reorganizing information within the columns

64

#2#Function for reorganizing information within the columns

65

# Clean the clinical columns produced by chngrownm(), based on column name.
#
# mat: data frame (or matrix) whose column names contain "Group", "Age",
#      "Sex", "PMI" or "Braak". The first column is never modified.
#
# Per column (first matching rule wins):
#   Group - strip everything up to the last ": " and any " ...;..." tail
#   Age   - keep digits only, convert to integer
#   Sex   - strip everything up to the last ": "
#   PMI   - keep digits and dots only, convert to numeric
#   Braak - strip the "label: " prefix, read the rest as a Roman numeral,
#           convert to integer
#
# Returns `mat` with the matching columns rewritten.
cinfo <- function(mat) {
  # seq_len(n)[-1] is empty for n < 2. The previous 2:col counted down to
  # c(2, 1) on a one-column input and rewrote the first column.
  for (j in seq_len(dim(mat)[2])[-1]) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    } else if (grepl("Age", col_name)) {
      # NOTE(review): all non-digits are dropped, so "85.5" becomes 855.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    } else if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    } else if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    } else if (grepl("Braak", col_name)) {
      stage <- gsub(".+:\\s", "", mat[, j])
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }

  mat
}

92

93

#3#Function for labeling the gene IDs without names

93

#3#Function for labeling the gene IDs without names

94

# Give every gene ID without a usable name its own ID as the name.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         gene/probe ID and column 2 the gene name.
#
# A name counts as missing when it is NA or is the literal text "NA",
# optionally followed by whitespace.
#
# Returns `GIDNAM` with the missing names in column 2 replaced by column 1.
NAFIXING <- function(GIDNAM) {
  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]

  # Vectorised over all rows. The previous 1:row loop ran on an empty table
  # and either failed or appended a spurious NA row.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)

  if (any(unnamed)) {
    GIDNAM[unnamed, 2] <- ids[unnamed]
  }

  GIDNAM
}

105

106

#4#Function for changing the gene ID to gene name

106

#4#Function for changing the gene ID to gene name

107

# Replace gene/probe IDs with gene names.
#
# GeneName: table whose first column holds IDs and second column the matching
#           gene names (it is transposed internally).
# DATA:     table whose first column holds the IDs to translate (it is
#           transposed internally).
#
# Returns t(DATA) with its first row translated; IDs that have no entry in
# GeneName, and NA IDs, are left as they are. When an ID occurs more than once
# in GeneName the first occurrence is used.
#
# NOTE(review): the caller visible in this file passes t(geneIDNam) and
# t(alzdat), which are transposed again here - confirm that the orientation
# is what the caller intends.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName)
  translated <- t(DATA)

  if (nrow(translated) == 0L || ncol(translated) == 0L) {
    return(translated)
  }

  ids <- translated[1, ]

  # Exact lookup. The IDs used to be pasted into a regular expression, so an
  # ID such as "g.2" also matched "gx2", and names containing "\\" were
  # misread as regex replacement escapes.
  position <- match(ids, lookup[1, ])
  found <- !is.na(ids) & !is.na(position)

  # Only assign when something matched, so an unmatched numeric matrix is not
  # coerced to character.
  if (any(found)) {
    translated[1, found] <- lookup[2, position[found]]
  }

  translated
}

125

#cgeneID <- function(GeneName,DATA){

125

#cgeneID <- function(GeneName,DATA){

126

# colGene <- dim(GeneName)[2]

126

# colGene <- dim(GeneName)[2]

127

# j <- 1

127

# j <- 1

128

# for(j in 1:colGene){

128

# for(j in 1:colGene){

129

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

129

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

130

# if(is.na(sum(chngsreq))==FALSE){

130

# if(is.na(sum(chngsreq))==FALSE){

131

# if(sum(chngsreq) > 0){

131

# if(sum(chngsreq) > 0){

132

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

132

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

133

# }

133

# }

134

# }

134

# }

135

# j = j+1

135

# j = j+1

136

# }

136

# }

137

# DATA

137

# DATA

138

#}

138

#}

139

140

#5#Function for adjusting the gene names

140

#5#Function for adjusting the gene names

141

# Pick one gene name per column from "///"-separated column names.
#
# DiData: matrix or data frame whose column names may list several gene
#         names separated by "///" (e.g. "ABC /// DEF").
# usecol: which of the listed names to keep (default: the first). Columns
#         that list fewer than `usecol` names fall back to their first name.
#
# Returns a character vector with one trimmed name per column of `DiData`.
gcnames <- function(DiData, usecol = 1) {
  # A zero-column input has nothing to rename; 1:0 used to iterate anyway.
  if (dim(DiData)[2] == 0L) {
    return(character(0))
  }

  # Split every name once instead of up to three times per column.
  parts <- strsplit(colnames(DiData), "///", fixed = TRUE)

  vapply(
    parts,
    function(candidates) {
      chosen <- if (length(candidates) >= usecol) {
        candidates[usecol]
      } else {
        # Also covers an empty column name, where this yields NA.
        candidates[1]
      }
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

156

157

#6# Function for discretizing the data

157

#6# Function for discretizing the data

158

# Discretise scaled expression values into three levels.
#
# NDATA: numeric matrix (or all-numeric data frame) of scaled values.
#
# Mapping:
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
#   missing value     -> copied through unchanged
#
# Returns a numeric matrix with the dimensions and column names of `NDATA`.
dndat <- function(NDATA) {
  n_row <- dim(NDATA)[1]
  n_col <- dim(NDATA)[2]

  # Start everything in the middle bin, so a value of exactly 1 lands there.
  # It used to match no branch and silently kept the initial 0 ("low").
  DDATA <- matrix(1, nrow = n_row, ncol = n_col)
  colnames(DDATA) <- colnames(NDATA)

  missing_value <- is.na(NDATA)
  # Masking with !missing_value keeps NA out of the logical subscripts.
  below <- !missing_value & NDATA < -1
  above <- !missing_value & NDATA > 1

  DDATA[below] <- 0
  DDATA[above] <- 2
  DDATA[missing_value] <- NDATA[missing_value]

  DDATA
}

187

188

189

#MajorFunction#This is the function that does everything else

189

#MajorFunction#This is the function that does everything else

190

THEFT <- function(){

190

THEFT <- function(){

191

#Set working directory based on the directory of the series matrix file Currently only works for windows

191

#Set working directory based on the directory of the series matrix file Currently only works for windows

192

wd <- getwd()

192

wd <- getwd()

193

#list.files()

193

#list.files()

194

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

194

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

195

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

195

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

196

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

196

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

197

GSEfloc <- list.files()[GSEfileloc]

197

GSEfloc <- list.files()[GSEfileloc]

198

#ALL DATA FILES WILL BE CLEANED

198

#ALL DATA FILES WILL BE CLEANED

199

if(numDAT == 1){

199

if(numDAT == 1){

200

#indexing the data files

200

#indexing the data files

201

n <- 1

201

n <- 1

202

for(n in 1: length(GSEfloc)){

202

for(n in 1: length(GSEfloc)){

203

alz <- GSEfloc[n]

203

alz <- GSEfloc[n]

204

205

#Working with the wordy part of the document

205

#Working with the wordy part of the document

206

alzword <- alz %>%

206

alzword <- alz %>%

207

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

207

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

208

filter(grepl("!Sample",X1))%>%

208

filter(grepl("!Sample",X1))%>%

209

filter(!grepl("!Sample_contact",X1))

209

filter(!grepl("!Sample_contact",X1))

210

211

#Getting the GPL file

211

#Getting the GPL file

212

genena <- grep("_platform_id",alzword$X1) %>%

212

genena <- grep("_platform_id",alzword$X1) %>%

213

alzword$X2[.] %>%

213

alzword$X2[.] %>%

214

str_trim(.) %>%

214

str_trim(.) %>%

215

paste0("^",.,"\\D") %>%

215

paste0("^",.,"\\D") %>%

216

grep(.,list.files()) %>%

216

grep(.,list.files()) %>%

217

list.files()[.]

217

list.files()[.]

218

219

#Find out if it is a soft GPL file or not

219

#Find out if it is a soft GPL file or not

220

soft <- strsplit(genena,"[\\|/]") %>%

220

soft <- strsplit(genena,"[\\|/]") %>%

221

.[[1]] %>%

221

.[[1]] %>%

222

.[length(.)] %>%

222

.[length(.)] %>%

223

grepl("soft",.)

223

grepl("soft",.)

224

225

##Changing row names and column names:

225

##Changing row names and column names:

226

ALZWORD <- t(alzword)

226

ALZWORD <- t(alzword)

227

rownames(ALZWORD)=NULL

227

rownames(ALZWORD)=NULL

228

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

228

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

229

ALZWORD <- chngrownm(ALZWORD)[-1,]

229

ALZWORD <- chngrownm(ALZWORD)[-1,]

230

ALZWORD <- ALZWORD%>%

230

ALZWORD <- ALZWORD%>%

231

as.data.frame()%>%

231

as.data.frame()%>%

232

dplyr::select(-starts_with("col"))

232

dplyr::select(-starts_with("col"))

233

234

##Reorganizing information within the columns and final clinical data

234

##Reorganizing information within the columns and final clinical data

235

ALZWORDF <- cinfo(ALZWORD)

235

ALZWORDF <- cinfo(ALZWORD)

236

237

238

#Working with Actual Data part of file

238

#Working with Actual Data part of file

239

alzdat <- alz %>%

239

alzdat <- alz %>%

240

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

240

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

241

ALZDAT <- t(alzdat[,-1])

241

ALZDAT <- t(alzdat[,-1])

242

rownames(ALZDAT)=NULL

242

rownames(ALZDAT)=NULL

243

244

##Is there a clean version of the GPL file available?

244

##Is there a clean version of the GPL file available?

245

gplnum <- strsplit(genena,"[\\|/]") %>%

245

gplnum <- strsplit(genena,"[\\|/]") %>%

246

.[[1]] %>%

246

.[[1]] %>%

247

.[length(.)] %>%

247

.[length(.)] %>%

248

gsub("\\D","",.)

248

gsub("\\D","",.)

249

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

249

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

250

if(clfileex >= 1){

250

if(clfileex >= 1){

251

#use the clean version

251

#use the clean version

252

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

252

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

253

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

253

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

254

255

}

255

}

256

else if(clfileex == 0){

256

else if(clfileex == 0){

257

##Lets Create a clean version

257

##Lets Create a clean version

258

259

##Gene ID to Gene Name

259

##Gene ID to Gene Name

260

if(soft == TRUE){

260

if(soft == TRUE){

261

#Check to see if there is already a file containing information on soft files

261

#Check to see if there is already a file containing information on soft files

262

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

262

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

263

if(fileex == 1){

263

if(fileex == 1){

264

#Check to see if this GPL soft file has been used before

264

#Check to see if this GPL soft file has been used before

265

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

265

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

266

.$GPL_FILE_NUM%>%

266

.$GPL_FILE_NUM%>%

267

grepl(gplnum,.) %>%

267

grepl(gplnum,.) %>%

268

sum()

268

sum()

269

if(IDF == 1){

269

if(IDF == 1){

270

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

270

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

271

.$GPL_FILE_NUM%>%

271

.$GPL_FILE_NUM%>%

272

grep(gplnum,.)

272

grep(gplnum,.)

273

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

273

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

274

.$LOC_ID %>%

274

.$LOC_ID %>%

275

.[IDLOCAL]

275

.[IDLOCAL]

276

geneIDNam <- genena %>%

276

geneIDNam <- genena %>%

277

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

277

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

278

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

278

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

279

}

279

}

280

else if(IDF == 0){

280

else if(IDF == 0){

281

#No information on this particular GPL file

281

#No information on this particular GPL file

282

idLOCGPL <- genena %>%

282

idLOCGPL <- genena %>%

283

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

283

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

284

t(.) %>%

284

t(.) %>%

285

grep("^ID\\s*$",.) %>%

285

grep("^ID\\s*$",.) %>%

286

-1

286

-1

287

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

287

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

288

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

288

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

289

geneIDNam <- genena %>%

289

geneIDNam <- genena %>%

290

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

290

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

291

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

291

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

292

}

292

}

293

}

293

}

294

else if(fileex == 0){

294

else if(fileex == 0){

295

#We must create a file that we can access for later use

295

#We must create a file that we can access for later use

296

idLOCGPL <- genena %>%

296

idLOCGPL <- genena %>%

297

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

297

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

298

t(.) %>%

298

t(.) %>%

299

grep("^ID\\s*$",.) %>%

299

grep("^ID\\s*$",.) %>%

300

-1

300

-1

301

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

301

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

302

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

302

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

303

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

303

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

304

geneIDNam <- genena %>%

304

geneIDNam <- genena %>%

305

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

305

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

306

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

306

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

307

}

307

}

308

}

308

}

309

else if(soft == FALSE){

309

else if(soft == FALSE){

310

geneIDNam <- genena %>%

310

geneIDNam <- genena %>%

311

read_delim(delim="\t",comment = "#")%>%

311

read_delim(delim="\t",comment = "#")%>%

312

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

312

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

313

}

313

}

314

315

##Labeling the gene IDs without names

315

##Labeling the gene IDs without names

316

geneIDNam <- NAFIXING(geneIDNam)

316

geneIDNam <- NAFIXING(geneIDNam)

317

318

##remove the whitespace

318

##remove the whitespace

319

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

319

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

320

321

##Here is the clean version

321

##Here is the clean version

322

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

322

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

323

}

323

}

324

325

326

327

##Changing the gene ID to gene name

327

##Changing the gene ID to gene name

328

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

328

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

329

colnames(ALZDAT) = ALZDAT1[1,]

329

colnames(ALZDAT) = ALZDAT1[1,]

330

331

332

##Adjusting the column names aka the gene names

332

##Adjusting the column names aka the gene names

333

colnames(ALZDAT) <- gcnames(ALZDAT)

333

colnames(ALZDAT) <- gcnames(ALZDAT)

334

335

336

#Full RAW Data

336

#Full RAW Data

337

Fullalzdwr <- ALZDAT %>%

337

Fullalzdwr <- ALZDAT %>%

338

as.data.frame() %>%

338

as.data.frame() %>%

339

cbind(ALZWORDF,.)

339

cbind(ALZWORDF,.)

340

341

#Raw file is output

341

#Raw file is output

342

nfnaex <- strsplit(alz,"[\\]") %>%

342

nfnaex <- strsplit(alz,"[\\]") %>%

343

.[[1]] %>%

343

.[[1]] %>%

344

.[length(.)] %>%

344

.[length(.)] %>%

345

gsub("\\D","",.) %>%

345

gsub("\\D","",.) %>%

346

c("GSE",.,"aftexcel.txt") %>%

346

c("GSE",.,"aftexcel.txt") %>%

347

paste(collapse = "")

347

paste(collapse = "")

348

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

348

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

349

350

351

352

#Now for the discretization part

352

#Now for the discretization part

353

##get the wordy part again

353

##get the wordy part again

354

rawword <- t(ALZWORDF)

354

rawword <- t(ALZWORDF)

355

356

##where is ID_REF located

356

##where is ID_REF located

357

hereim <- grep("ID_REF",rownames(rawword))

357

hereim <- grep("ID_REF",rownames(rawword))

358

359

##Subject Names GSM...

359

##Subject Names GSM...

360

subjnam <- rawword[hereim,]

360

subjnam <- rawword[hereim,]

361

362

##Getting the names for the rows

362

##Getting the names for the rows

363

namedarows <- rownames(rawword)[-hereim] %>%

363

namedarows <- rownames(rawword)[-hereim] %>%

364

as.data.frame()

364

as.data.frame()

365

RAWWORD <- rawword[-hereim,] %>%

365

RAWWORD <- rawword[-hereim,] %>%

366

as.data.frame() %>%

366

as.data.frame() %>%

367

bind_cols(namedarows,.)

367

bind_cols(namedarows,.)

368

z <- 1

368

z <- 1

369

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

369

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

370

for(z in 1:dim(RAWWORD)[1]){

370

for(z in 1:dim(RAWWORD)[1]){

371

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

371

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

372

z <- z + 1

372

z <- z + 1

373

}

373

}

374

375

colnames(naroww) <- "ROW_NAs"

375

colnames(naroww) <- "ROW_NAs"

376

RAWWORD <- bind_cols(RAWWORD,naroww)

376

RAWWORD <- bind_cols(RAWWORD,naroww)

377

378

379

roALZna <- t(ALZDAT) %>%

379

roALZna <- t(ALZDAT) %>%

380

rownames(.) %>%

380

rownames(.) %>%

381

as.data.frame(.)

381

as.data.frame(.)

382

colnames(roALZna) <- "ID_REF"

382

colnames(roALZna) <- "ID_REF"

383

384

RAWDAT <- t(ALZDAT) %>%

384

RAWDAT <- t(ALZDAT) %>%

385

as.data.frame(.)

385

as.data.frame(.)

386

colnames(RAWDAT) <- NULL

386

colnames(RAWDAT) <- NULL

387

rownames(RAWDAT) <- NULL

387

rownames(RAWDAT) <- NULL

388

389

RAWDAT2 <- RAWDAT %>%

389

RAWDAT2 <- RAWDAT %>%

390

cbind(roALZna,.) %>%

390

cbind(roALZna,.) %>%

391

dplyr::arrange(.,ID_REF)

391

dplyr::arrange(.,ID_REF)

392

393

##Editing the file for R processing

393

##Editing the file for R processing

394

RAWDATID <- RAWDAT2[,1] %>%

394

RAWDATID <- RAWDAT2[,1] %>%

395

as.matrix(.)

395

as.matrix(.)

396

397

RAWDATNUM <- RAWDAT2[,-1] %>%

397

RAWDATNUM <- RAWDAT2[,-1] %>%

398

mapply(.,FUN = as.numeric) %>%

398

mapply(.,FUN = as.numeric) %>%

399

t(.)

399

t(.)

400

401

##Consolidating genes with the same name

401

##Consolidating genes with the same name

402

###create empty matrix of size equal to tabRDATID

402

###create empty matrix of size equal to tabRDATID

403

tabRDATID <- table(RAWDATID)

403

tabRDATID <- table(RAWDATID)

404

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

404

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

405

j <- 1

405

j <- 1

406

for(j in 1:length(tabRDATID)){

406

for(j in 1:length(tabRDATID)){

407

##Putting the ones without duplicates in their new homes

407

##Putting the ones without duplicates in their new homes

408

if(tabRDATID[j] == 1){

408

if(tabRDATID[j] == 1){

409

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

409

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

410

}

410

}

411

##Averaging duplicates and putting them in their new homes

411

##Averaging duplicates and putting them in their new homes

412

else if(tabRDATID[j] > 1){

412

else if(tabRDATID[j] > 1){

413

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

413

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

414

}

414

}

415

j <- j + 1

415

j <- j + 1

416

}

416

}

417

418

##Scaling the Data

418

##Scaling the Data

419

scrawdat <- NuRDATN%>%

419

scrawdat <- NuRDATN%>%

420

scale()

420

scale()

421

attr(scrawdat,"scaled:center") <- NULL

421

attr(scrawdat,"scaled:center") <- NULL

422

attr(scrawdat,"scaled:scale") <- NULL

422

attr(scrawdat,"scaled:scale") <- NULL

423

colnames(scrawdat) <- rownames(tabRDATID)

423

colnames(scrawdat) <- rownames(tabRDATID)

424

425

##Discretized the Data

425

##Discretized the Data

426

dialzdat <- scrawdat %>%

426

dialzdat <- scrawdat %>%

427

dndat(.) %>%

427

dndat(.) %>%

428

t()%>%

428

t()%>%

429

as.data.frame(.)

429

as.data.frame(.)

430

colnames(dialzdat) <- rownames(RAWDATNUM)

430

colnames(dialzdat) <- rownames(RAWDATNUM)

431

432

##setting "ID_REF" as a new variable

432

##setting "ID_REF" as a new variable

433

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

433

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

434

colnames(geneNAM) <- "ID_REF"

434

colnames(geneNAM) <- "ID_REF"

435

rownames(dialzdat) <- NULL

435

rownames(dialzdat) <- NULL

436

dialzdat <-bind_cols(geneNAM,dialzdat)

436

dialzdat <-bind_cols(geneNAM,dialzdat)

437

438

##NAs in a column

438

##NAs in a column

439

x <- 2

439

x <- 2

440

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

440

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

441

nacol[1,1] = "COL_NAs"

441

nacol[1,1] = "COL_NAs"

442

for(x in 2:dim(dialzdat)[2]){

442

for(x in 2:dim(dialzdat)[2]){

443

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

443

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

444

x <- x + 1

444

x <- x + 1

445

}

445

}

446

colnames(nacol) <- colnames(dialzdat)

446

colnames(nacol) <- colnames(dialzdat)

447

dialzdat <- bind_rows(dialzdat,nacol)

447

dialzdat <- bind_rows(dialzdat,nacol)

448

449

##NAs in a row

449

##NAs in a row

450

y <- 1

450

y <- 1

451

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

451

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

452

for(y in 1:dim(dialzdat)[1]){

452

for(y in 1:dim(dialzdat)[1]){

453

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

453

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

454

y <- y + 1

454

y <- y + 1

455

}

455

}

456

colnames(narowd) <- "ROW_NAs"

456

colnames(narowd) <- "ROW_NAs"

457

dialzdat <- bind_cols(dialzdat,narowd)

457

dialzdat <- bind_cols(dialzdat,narowd)

458

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

458

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

459

colnames(RAWWORD) <- colnames(dialzdat)

459

colnames(RAWWORD) <- colnames(dialzdat)

460

##converting to character so that the clinical can be brought together with discrete data

460

##converting to character so that the clinical can be brought together with discrete data

461

k <- 2

461

k <- 2

462

for(k in 2:dim(dialzdat)[2]-1){

462

for(k in 2:dim(dialzdat)[2]-1){

463

dialzdat[,k] <- as.character(dialzdat[,k])

463

dialzdat[,k] <- as.character(dialzdat[,k])

464

k <- k + 1

464

k <- k + 1

465

}

465

}

466

#The End the full data

466

#The End the full data

467

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

467

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

468

469

#Produces Discrete file

469

#Produces Discrete file

470

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

470

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

471

.[[1]] %>%

471

.[[1]] %>%

472

.[length(.)] %>%

472

.[length(.)] %>%

473

gsub("\\D","",.) %>%

473

gsub("\\D","",.) %>%

474

c("GSE",.,"dscrt.txt") %>%

474

c("GSE",.,"dscrt.txt") %>%

475

paste(collapse = "")

475

paste(collapse = "")

476

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

476

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

477

n <- n +1

477

n <- n +1

478

}

478

}

479

}

479

}

480

481

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

481

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

482

else if(numDAT == 2){

482

else if(numDAT == 2){

483

#All the files you want to analyze

483

#All the files you want to analyze

484

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

484

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

485

if(length(ANDIS) == 0){

485

if(length(ANDIS) == 0){

486

#Spit out a warning

486

#Spit out a warning

487

warning("You did not select any files and so no cleaning will be performed")

487

warning("You did not select any files and so no cleaning will be performed")

488

} else{

488

} else{

489

#indexing the data files

489

#indexing the data files

490

n <- 1

490

n <- 1

491

for(n in 1: length(ANDIS)){

491

for(n in 1: length(ANDIS)){

492

alz <- ANDIS[n]

492

alz <- ANDIS[n]

493

494

#Working with the wordy part of the document

494

#Working with the wordy part of the document

495

alzword <- alz %>%

495

alzword <- alz %>%

496

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

496

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

497

filter(grepl("!Sample",X1))%>%

497

filter(grepl("!Sample",X1))%>%

498

filter(!grepl("!Sample_contact",X1))

498

filter(!grepl("!Sample_contact",X1))

499

500

#Getting the GPL file

500

#Getting the GPL file

501

genena <- grep("_platform_id",alzword$X1) %>%

501

genena <- grep("_platform_id",alzword$X1) %>%

502

alzword$X2[.] %>%

502

alzword$X2[.] %>%

503

str_trim(.) %>%

503

str_trim(.) %>%

504

paste0("^",.,"\\D") %>%

504

paste0("^",.,"\\D") %>%

505

grep(.,list.files()) %>%

505

grep(.,list.files()) %>%

506

list.files()[.]

506

list.files()[.]

507

508

#Find out if it is a soft GPL file or not

508

#Find out if it is a soft GPL file or not

509

soft <- strsplit(genena,"[\\|/]") %>%

509

soft <- strsplit(genena,"[\\|/]") %>%

510

.[[1]] %>%

510

.[[1]] %>%

511

.[length(.)] %>%

511

.[length(.)] %>%

512

grepl("soft",.)

512

grepl("soft",.)

513

514

##Changing row names and column names:

514

##Changing row names and column names:

515

ALZWORD <- t(alzword)

515

ALZWORD <- t(alzword)

516

rownames(ALZWORD)=NULL

516

rownames(ALZWORD)=NULL

517

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

517

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

518

ALZWORD <- chngrownm(ALZWORD)[-1,]

518

ALZWORD <- chngrownm(ALZWORD)[-1,]

519

ALZWORD <- ALZWORD%>%

519

ALZWORD <- ALZWORD%>%

520

as.data.frame()%>%

520

as.data.frame()%>%

521

dplyr::select(-starts_with("col"))

521

dplyr::select(-starts_with("col"))

522

523

##Reorganizing information within the columns and final clinical data

523

##Reorganizing information within the columns and final clinical data

524

ALZWORDF <- cinfo(ALZWORD)

524

ALZWORDF <- cinfo(ALZWORD)

525

526

527

#Working with Actual Data part of file
#Read `alz` again: skip the first line, ignore text from "!" onward as a
#comment and use the first remaining line as column names.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!",
                     skip = 1)
#Transpose everything except the first column and drop the row names.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

532

533

##Is there a clean version of the GPL file available?
#gplnum holds the digits found in the last piece of genena.
gpl_file_name <- strsplit(genena, "[\\|/]")[[1]]
gplnum <- gsub("\\D", "", gpl_file_name[length(gpl_file_name)])
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
#Test for exactly the file that is read below; a pattern search over the
#directory would also accept names that merely contain "Clean_GPL<digits>".
clfileex <- as.integer(file.exists(clean_gpl_file))
if (clfileex >= 1) {
  #use the clean version
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version

  #Keep the ID column plus every column whose name matches the symbol pattern
  pick_id_symbol <- function(tbl) {
    dplyr::select(tbl, ID,
                  grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",
                       colnames(tbl)))
  }

  ##Gene ID to Gene Name
  if (soft) {
    #GPL_ID_LOC.txt stores, per GPL number (GPL_FILE_NUM), the number of
    #lines to skip (LOC_ID) when reading that soft file.
    id_loc_file <- "GPL_ID_LOC.txt"
    #Check to see if there is already a file containing information on soft files
    fileex <- file.exists(id_loc_file)
    idlocgpl <- numeric(0)
    if (fileex) {
      #Check to see if this GPL soft file has been used before.
      #The GPL number is compared for equality, so that e.g. 57 is not
      #mistaken for 570.
      id_loc <- read_delim(id_loc_file, delim = "\t", col_names = TRUE)
      idlocgpl <- id_loc$LOC_ID[which(id_loc$GPL_FILE_NUM ==
                                        as.integer(gplnum))]
    }
    if (length(idlocgpl) >= 1) {
      #Known GPL file: reuse the stored value (the first one if the file
      #happens to list this GPL number more than once)
      idLOCGPL <- idlocgpl[1]
    } else {
      #No information on this particular GPL file: look for the "ID" cell in
      #the first 1000 non-comment lines; its position minus one is used as
      #the number of lines to skip.
      idLOCGPL <- grep("^ID\\s*$",
                       t(read_delim(genena, delim = "\t", col_names = FALSE,
                                    comment = "!", n_max = 1000))) - 1
      new_entry <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      if (fileex) {
        #Append the new pair to the existing file
        cat(new_entry, file = id_loc_file, sep = "\t", fill = TRUE,
            append = TRUE)
      } else {
        #We must create a file that we can access for later use
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = id_loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }
    geneIDNam <- pick_id_symbol(
      read_delim(genena, delim = "\t", col_names = TRUE, comment = "!",
                 skip = idLOCGPL)
    )
  } else {
    #Not a soft file: read it directly, ignoring text from "#" onward
    geneIDNam <- pick_id_symbol(
      read_delim(genena, delim = "\t", comment = "#")
    )
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

613

614

615

616

##Changing the gene ID to gene name
#cgeneID() receives the transposed id/name table and the transposed data;
#the first row of its result becomes the column names of ALZDAT.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

#Full RAW Data
#The ALZWORDF columns come first, followed by the ALZDAT columns.
Fullalzdwr <- cbind(ALZWORDF, as.data.frame(ALZDAT))

629

630

#Raw file is output
#The output name is "GSE" + the digits of the last piece of `alz` +
#"aftexcel.txt". `alz` is split with the same separator pattern as the other
#path splits in this file, so that digits from directory names cannot end up
#in the file name when the path is written with "/".
alz_parts <- strsplit(alz, "[\\|/]")[[1]]
nfnaex <- paste(c("GSE", gsub("\\D", "", alz_parts[length(alz_parts)]),
                  "aftexcel.txt"), collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

638

639

640

641

#Now for the discretization part
##get the wordy part again
rawword <- t(ALZWORDF)

##where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))

##Subject Names GSM...
subjnam <- rawword[hereim, ]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame()
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame() %>%
  bind_cols(namedarows, .)

##Number of NAs in every row, added as the last column "ROW_NAs"
#Counted for all rows at once instead of looping over 1:dim(RAWWORD)[1].
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)

666

667

668

#The column names of ALZDAT as a one-column table called "ID_REF"
roALZna <- as.data.frame(colnames(ALZDAT))
colnames(roALZna) <- "ID_REF"

#The transposed data as a data frame with its column and row names reset
RAWDAT <- as.data.frame(t(ALZDAT))
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

#Put ID_REF in front and sort the rows by it
RAWDAT2 <- dplyr::arrange(cbind(roALZna, RAWDAT), ID_REF)

##Editing the file for R processing
#First column (the ids) as a matrix; the remaining columns converted to
#numeric one by one and transposed.
RAWDATID <- as.matrix(RAWDAT2[, 1])
RAWDATNUM <- t(mapply(FUN = as.numeric, RAWDAT2[, -1]))

689

690

##Consolidating genes with the same name
###create empty matrix with one column per distinct name in tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
#Column positions of every name, collected in one pass and kept in the same
#order as tabRDATID (instead of searching RAWDATID once per name).
gene_cols <- split(seq_along(RAWDATID),
                   factor(RAWDATID, levels = names(tabRDATID)))
for (j in seq_along(gene_cols)) {
  cols <- gene_cols[[j]]
  if (length(cols) == 1) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, cols]
  } else if (length(cols) > 1) {
    ##Averaging duplicates and putting them in their new homes
    #drop = FALSE keeps a matrix for rowMeans() whatever the row count
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
  }
}

706

707

##Scaling the Data
#scale() with its defaults, then remove the two attributes it attaches and
#label the columns with the names of tabRDATID.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

##Discretized the Data
#dndat() is applied to the scaled matrix; the result is transposed and
#turned into a data frame whose columns are named after the rows of RAWDATNUM.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
#The row names move into a leading "ID_REF" column.
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

726

727

##NAs in a column
#One extra row is appended: the label "COL_NAs" in the first column and, for
#every other column, the number of NAs it contains. The counts are computed
#for all columns at once instead of looping over 2:dim(dialzdat)[2].
na_per_col <- unname(colSums(is.na(dialzdat[, -1, drop = FALSE])))
nacol <- as.data.frame(t(c(0, na_per_col)), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

737

738

##NAs in a row
#The number of NAs in every row is appended as the last column "ROW_NAs",
#counted for all rows at once instead of extracting the rows one by one.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
#The columns between the first and the last one are named after subjnam;
#RAWWORD then takes over the same column names.
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

749

##converting to character so that the clinical can be brought together with discrete data
#Every column except the last one is converted. The former loop range
#`2:dim(dialzdat)[2]-1` is parsed as `(2:n) - 1`, so it already started at
#column 1; the range is now written out explicitly to avoid that trap.
char_cols <- seq_len(dim(dialzdat)[2] - 1)
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)
#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

757

758

#Produces Discrete file
#File name: "GSE" + the digits of the last piece of `alz` + "dscrt.txt"
alz_pieces <- strsplit(alz, "[\\|/]")[[1]]
gse_digits <- gsub("\\D", "", alz_pieces[length(alz_pieces)])
nfnaex2 <- paste(c("GSE", gse_digits, "dscrt.txt"), collapse = "")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t", col.names = TRUE,
            row.names = FALSE)

766

767

768

n <- n + 1

768

n <- n + 1

769

}

769

}

770

}

770

}

771

}

771

}

772

}

772

}

773

#The Rest of this code will be used every time you want to change a data set

773

#The Rest of this code will be used every time you want to change a data set

774

THEFT()

774

THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Don't use this code yet