Efrain Gonzalez / Cleaning and Fixing Data with R

1

########################################################################

1

########################################################################

2

# Don't Use This Code Just Yet #

2

# Don't Use This Code Just Yet #

3

########################################################################

3

########################################################################

4

#Efrain H. Gonzalez

4

#Efrain H. Gonzalez

5

#6/16/2017

5

#6/16/2017

6

7

#Libraries required to run the code

7

#Libraries required to run the code

8

library(pryr)

8

library(pryr)

9

library(MASS)

9

library(MASS)

10

library(dplyr)

10

library(dplyr)

11

library(tidyr)

11

library(tidyr)

12

library(readr)

12

library(readr)

13

library(stringr)

13

library(stringr)

14

15

16

#Necessary Functions

16

#Necessary Functions

17

#1#Function for handling the changing of row names and column names

17

#1#Function for handling the changing of row names and column names

18

# Rename the columns of a transposed sample-annotation matrix.
#
# Args:
#   mat: matrix-like object whose first row holds the original field labels
#        (e.g. "!Sample_title") and whose second row holds the first sample's
#        values. Column names must already be set (one per column).
#
# Returns:
#   `mat` with some column names replaced:
#     * row-1 label "!Sample_source_name_ch1" -> "Brain_Region"
#     * row-1 label "!Sample_title"           -> "Title"
#     * row-1 label "!Sample_geo_accession"   -> "ID_REF"
#     * otherwise the row-2 value is tested against the keyword patterns
#       below and the column becomes "<Prefix><n>", where n counts the
#       columns already given that prefix (Sex1, Sex2, ...).
#   Columns that match nothing keep their existing name.
chngrownm <- function(mat) {
  # Exact row-1 labels that map to one fixed column name.
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Keyword patterns applied to the row-2 value, in this order. The checks
  # are NOT mutually exclusive: when several match, each matching prefix
  # consumes a counter value and the last match decides the final name.
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  next_index <- rep(1L, length(keyword_patterns))
  names(next_index) <- names(keyword_patterns)

  # seq_len() keeps a zero-column input from iterating over c(1, 0).
  for (idx in seq_len(ncol(mat))) {
    label <- as.character(mat[1, idx])

    # A missing label can never be one of the fixed labels; guarding on NA
    # avoids `if (NA)` aborting the whole run.
    if (!is.na(label) && label %in% names(fixed_names)) {
      colnames(mat)[idx] <- fixed_names[[label]]
      next
    }

    for (prefix in names(keyword_patterns)) {
      if (grepl(keyword_patterns[[prefix]], mat[2, idx])) {
        colnames(mat)[idx] <- paste0(prefix, next_index[[prefix]])
        next_index[[prefix]] <- next_index[[prefix]] + 1L
      }
    }
  }

  mat
}

63

61

64

#2#Function for reorganizing information within the columns

62

#2#Function for reorganizing information within the columns

65

# Clean the values of the clinical columns, chosen by column name.
#
# Args:
#   mat: data frame (or matrix) of sample annotations whose column names were
#        assigned by chngrownm(). The first column is never modified.
#
# Returns:
#   `mat` where, for every column from the second onward whose name contains
#     "Group" : the leading "<label>: " is removed, as is any text matching
#               "\\s.+;.+";
#     "Age"   : every non-digit is removed and the result cast to integer;
#     "Sex"   : the leading "<label>: " is removed;
#     "PMI"   : everything except digits and "." is removed, cast to numeric;
#     "Braak" : the leading "<label>: " is removed and the remaining Roman
#               numeral converted to integer.
#   The name tests are tried in that order and only the first hit applies.
cinfo <- function(mat) {
  field_names <- colnames(mat)
  if (is.null(field_names)) {
    # Nothing can be selected by name.
    return(mat)
  }

  drop_label <- function(values) gsub(".+:\\s", "", values)

  # seq_len(n)[-1] is empty when there are fewer than two columns, whereas
  # 2:n would count downward and touch the first column.
  for (idx in seq_len(ncol(mat))[-1]) {
    field <- field_names[idx]

    if (grepl("Group", field)) {
      mat[, idx] <- gsub(".+:\\s|\\s.+;.+", "", mat[, idx])
    } else if (grepl("Age", field)) {
      # NOTE(review): "\\D" also removes a decimal point, so "85.5" becomes
      # 855 - confirm ages are always whole numbers in the input.
      mat[, idx] <- as.integer(gsub("\\D", "", mat[, idx]))
    } else if (grepl("Sex", field)) {
      mat[, idx] <- drop_label(mat[, idx])
    } else if (grepl("PMI", field)) {
      mat[, idx] <- as.numeric(gsub("[^0-9\\.]", "", mat[, idx]))
    } else if (grepl("Braak", field)) {
      mat[, idx] <- as.integer(utils::as.roman(drop_label(mat[, idx])))
    }
  }

  mat
}

92

86

93

#3#Function for labeling the gene IDs without names

87

#3#Function for labeling the gene IDs without names

94

# Give every unnamed gene its ID as a fallback name.
#
# Args:
#   GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds
#           the gene/probe ID and column 2 the gene name.
#
# Returns:
#   `GIDNAM` where each entry of column 2 that is missing (NA), or is the
#   literal text "NA" optionally followed by whitespace, has been replaced
#   by the column-1 value of the same row. All other cells are unchanged.
NAFIXING <- function(GIDNAM) {
  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]

  # One vectorised mask instead of a per-row loop; it is simply empty for a
  # zero-row table, so no phantom row is ever created.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)

  if (any(unnamed)) {
    GIDNAM[unnamed, 2] <- ids[unnamed]
  }

  GIDNAM
}

105

99

106

#4#Function for changing the gene ID to gene name

100

#4#Function for changing the gene ID to gene name

107

# Replace gene IDs by gene names in the first row of the transposed data.
#
# Args:
#   GeneName: table that is transposed internally; after transposition row 1
#             must hold the gene IDs and row 2 the corresponding gene names.
#   DATA:     table that is transposed internally; after transposition row 1
#             must hold the IDs to translate.
#
# Returns:
#   t(DATA) in which every row-1 entry that equals an ID in t(GeneName)[1, ]
#   has been replaced by the name stored beneath that ID. When an ID occurs
#   more than once in the lookup the first occurrence is used. Entries
#   without a match (including missing ones) are left as they are.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName)
  relabelled <- t(DATA)

  ids <- relabelled[1, ]

  # Exact matching: IDs are data, not regular expressions, so characters
  # such as ".", "+" or "(" must not be given any special meaning.
  hit <- match(ids, lookup[1, ])
  # match() would pair a missing ID with a missing lookup key; a missing ID
  # must stay missing.
  hit[is.na(ids)] <- NA_integer_

  found <- !is.na(hit)
  if (any(found)) {
    relabelled[1, found] <- lookup[2, hit[found]]
  }

  relabelled
}

125

#cgeneID <- function(GeneName,DATA){

119

#cgeneID <- function(GeneName,DATA){

126

# colGene <- dim(GeneName)[2]

120

# colGene <- dim(GeneName)[2]

127

# j <- 1

121

# j <- 1

128

# for(j in 1:colGene){

122

# for(j in 1:colGene){

129

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

123

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

130

# if(is.na(sum(chngsreq))==FALSE){

124

# if(is.na(sum(chngsreq))==FALSE){

131

# if(sum(chngsreq) > 0){

125

# if(sum(chngsreq) > 0){

132

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

126

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

133

# }

127

# }

134

# }

128

# }

135

# j = j+1

129

# j = j+1

136

# }

130

# }

137

# DATA

131

# DATA

138

#}

132

#}

139

133

140

#5#Function for adjusting the gene names

134

#5#Function for adjusting the gene names

141

# Pick one gene name out of each "///"-separated column name.
#
# Args:
#   DiData: matrix or data frame whose column names may list several names
#           separated by "///".
#   usecol: position of the name to keep within each column name (default 1).
#
# Returns:
#   Character vector with one entry per column of `DiData`: the `usecol`-th
#   piece of the column name, or the first piece when the name has fewer than
#   `usecol` pieces, with surrounding whitespace trimmed by str_trim().
gcnames <- function(DiData, usecol = 1) {
  if (ncol(DiData) == 0) {
    # No columns, so there are no names to adjust.
    return(character(0))
  }

  # Split every column name once, then choose the requested piece.
  pieces <- strsplit(colnames(DiData), "///")
  chosen <- vapply(
    pieces,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1),
    USE.NAMES = FALSE
  )

  str_trim(chosen)
}

156

150

157

#6# Function for discretizing the data

151

#6# Function for discretizing the data

158

# Discretize numeric data into three levels.
#
# Args:
#   NDATA: numeric matrix (or all-numeric data frame).
#
# Returns:
#   Numeric matrix with the same dimensions and column names as `NDATA`:
#     0  where the value is below -1,
#     1  where the value lies in [-1, 1],
#     2  where the value is above 1,
#   and the original missing value (NA / NaN) where the input is missing.
#   Row names are not carried over.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start from the middle level so that both boundaries, -1 and 1, belong to
  # it; a value of exactly 1 must not end up in the "low" level.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops NA comparisons, so missing cells are handled only below.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  missing_cells <- which(is.na(values))
  DDATA[missing_cells] <- values[missing_cells]

  DDATA
}

187

179

188

180

189

#MajorFunction#This is the function that does everything else

181

#MajorFunction#This is the function that does everything else

190

THEFT <- function(){

182

THEFT <- function(){

191

#Set working directory based on the directory of the series matrix file Currently only works for windows

183

#Set working directory based on the directory of the series matrix file Currently only works for windows

192

wd <- getwd()

184

wd <- getwd()

193

#list.files()

185

#list.files()

194

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

186

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

195

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

187

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

196

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

188

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

197

GSEfloc <- list.files()[GSEfileloc]

189

GSEfloc <- list.files()[GSEfileloc]

198

#ALL DATA FILES WILL BE CLEANED

190

#ALL DATA FILES WILL BE CLEANED

199

if(numDAT == 1){

191

if(numDAT == 1){

200

#indexing the data files

192

#indexing the data files

201

n <- 1

193

n <- 1

202

for(n in 1: length(GSEfloc)){

194

for(n in 1: length(GSEfloc)){

203

alz <- GSEfloc[n]

195

alz <- GSEfloc[n]

204

196

205

#Working with the wordy part of the document

197

#Working with the wordy part of the document

206

alzword <- alz %>%

198

alzword <- alz %>%

207

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

199

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

208

filter(grepl("!Sample",X1))%>%

200

filter(grepl("!Sample",X1))%>%

209

filter(!grepl("!Sample_contact",X1))

201

filter(!grepl("!Sample_contact",X1))

210

202

211

#Getting the GPL file

203

#Getting the GPL file

212

genena <- grep("_platform_id",alzword$X1) %>%

204

genena <- grep("_platform_id",alzword$X1) %>%

213

alzword$X2[.] %>%

205

alzword$X2[.] %>%

214

str_trim(.) %>%

206

str_trim(.) %>%

215

paste0("^",.,"\\D") %>%

207

paste0("^",.,"\\D") %>%

216

grep(.,list.files()) %>%

208

grep(.,list.files()) %>%

217

list.files()[.]

209

list.files()[.]

218

210

219

#Find out if it is a soft GPL file or not

211

#Find out if it is a soft GPL file or not

220

soft <- strsplit(genena,"[\\|/]") %>%

212

soft <- strsplit(genena,"[\\|/]") %>%

221

.[[1]] %>%

213

.[[1]] %>%

222

.[length(.)] %>%

214

.[length(.)] %>%

223

grepl("soft",.)

215

grepl("soft",.)

224

216

225

##Changing row names and column names:

217

##Changing row names and column names:

226

ALZWORD <- t(alzword)

218

ALZWORD <- t(alzword)

227

rownames(ALZWORD)=NULL

219

rownames(ALZWORD)=NULL

228

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

220

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

229

ALZWORD <- chngrownm(ALZWORD)[-1,]

221

ALZWORD <- chngrownm(ALZWORD)[-1,]

230

ALZWORD <- ALZWORD%>%

222

ALZWORD <- ALZWORD%>%

231

as.data.frame()%>%

223

as.data.frame()%>%

232

dplyr::select(-starts_with("col"))

224

dplyr::select(-starts_with("col"))

233

225

234

##Reorganizing information within the columns and final clinical data

226

##Reorganizing information within the columns and final clinical data

235

ALZWORDF <- cinfo(ALZWORD)

227

ALZWORDF <- cinfo(ALZWORD)

236

228

237

229

238

#Working with Actual Data part of file

230

#Working with Actual Data part of file

239

alzdat <- alz %>%

231

alzdat <- alz %>%

240

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

232

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

241

ALZDAT <- t(alzdat[,-1])

233

ALZDAT <- t(alzdat[,-1])

242

rownames(ALZDAT)=NULL

234

rownames(ALZDAT)=NULL

243

235

244

##Is there a clean version of the GPL file available?

236

##Is there a clean version of the GPL file available?

245

gplnum <- strsplit(genena,"[\\|/]") %>%

237

gplnum <- strsplit(genena,"[\\|/]") %>%

246

.[[1]] %>%

238

.[[1]] %>%

247

.[length(.)] %>%

239

.[length(.)] %>%

248

gsub("\\D","",.)

240

gsub("\\D","",.)

249

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

241

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

250

if(clfileex >= 1){

242

if(clfileex >= 1){

251

#use the clean version

243

#use the clean version

252

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

244

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

253

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

245

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

254

246

255

}

247

}

256

else if(clfileex == 0){

248

else if(clfileex == 0){

257

##Lets Create a clean version

249

##Lets Create a clean version

258

250

259

##Gene ID to Gene Name

251

##Gene ID to Gene Name

260

if(soft == TRUE){

252

if(soft == TRUE){

261

#Check to see if there is already a file containing information on soft files

253

#Check to see if there is already a file containing information on soft files

262

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

254

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

263

if(fileex == 1){

255

if(fileex == 1){

264

#Check to see if this GPL soft file has been used before

256

#Check to see if this GPL soft file has been used before

265

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

257

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

266

.$GPL_FILE_NUM%>%

258

.$GPL_FILE_NUM%>%

267

grepl(gplnum,.) %>%

259

grepl(gplnum,.) %>%

268

sum()

260

sum()

269

if(IDF == 1){

261

if(IDF == 1){

270

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

262

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

271

.$GPL_FILE_NUM%>%

263

.$GPL_FILE_NUM%>%

272

grep(gplnum,.)

264

grep(gplnum,.)

273

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

265

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

274

.$LOC_ID %>%

266

.$LOC_ID %>%

275

.[IDLOCAL]

267

.[IDLOCAL]

276

geneIDNam <- genena %>%

268

geneIDNam <- genena %>%

277

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

269

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

278

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

270

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

279

}

271

} else if(IDF == 0){

280

else if(IDF == 0){

281

#No information on this particular GPL file

272

#No information on this particular GPL file

282

idLOCGPL <- genena %>%

273

idLOCGPL <- genena %>%

283

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

274

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

284

t(.) %>%

275

t(.) %>%

285

grep("^ID\\s*$",.) %>%

276

grep("^ID\\s*$",.) %>%

286

-1

277

-1

287

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

278

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

288

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

279

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

289

geneIDNam <- genena %>%

280

geneIDNam <- genena %>%

290

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

281

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

291

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

282

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

292

}

283

}

293

}

284

} else if(fileex == 0){

294

else if(fileex == 0){

295

#We must create a file that we can access for later use

285

#We must create a file that we can access for later use

296

idLOCGPL <- genena %>%

286

idLOCGPL <- genena %>%

297

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

287

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

298

t(.) %>%

288

t(.) %>%

299

grep("^ID\\s*$",.) %>%

289

grep("^ID\\s*$",.) %>%

300

-1

290

-1

301

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

291

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

302

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

292

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

303

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

293

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

304

geneIDNam <- genena %>%

294

geneIDNam <- genena %>%

305

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

295

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

306

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

296

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

307

}

297

}

308

}

298

} else if(soft == FALSE){

309

else if(soft == FALSE){

310

geneIDNam <- genena %>%

299

geneIDNam <- genena %>%

311

read_delim(delim="\t",comment = "#")%>%

300

read_delim(delim="\t",comment = "#")%>%

312

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

301

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

313

}

302

}

314

303

315

##Labeling the gene IDs without names

304

##Labeling the gene IDs without names

316

geneIDNam <- NAFIXING(geneIDNam)

305

geneIDNam <- NAFIXING(geneIDNam)

317

306

318

##remove the whitespace

307

##remove the whitespace

319

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

308

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

320

309

321

##Here is the clean version

310

##Here is the clean version

322

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

311

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

323

}

312

}

324

313

325

314

326

315

327

##Changing the gene ID to gene name

316

##Changing the gene ID to gene name

328

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

317

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

329

colnames(ALZDAT) = ALZDAT1[1,]

318

colnames(ALZDAT) = ALZDAT1[1,]

330

319

331

320

332

##Adjusting the column names aka the gene names

321

##Adjusting the column names aka the gene names

333

colnames(ALZDAT) <- gcnames(ALZDAT)

322

colnames(ALZDAT) <- gcnames(ALZDAT)

334

323

335

324

336

#Full RAW Data

325

#Full RAW Data

337

Fullalzdwr <- ALZDAT %>%

326

Fullalzdwr <- ALZDAT %>%

338

as.data.frame() %>%

327

as.data.frame() %>%

339

cbind(ALZWORDF,.)

328

cbind(ALZWORDF,.)

340

329

341

#Raw file is output

330

#Raw file is output

342

nfnaex <- strsplit(alz,"[\\]") %>%

331

nfnaex <- strsplit(alz,"[\\]") %>%

343

.[[1]] %>%

332

.[[1]] %>%

344

.[length(.)] %>%

333

.[length(.)] %>%

345

gsub("\\D","",.) %>%

334

gsub("\\D","",.) %>%

346

c("GSE",.,"aftexcel.txt") %>%

335

c("GSE",.,"aftexcel.txt") %>%

347

paste(collapse = "")

336

paste(collapse = "")

348

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

337

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

349

338

350

339

351

340

352

#Now for the discretization part

341

#Now for the discretization part

353

##get the wordy part again

342

##get the wordy part again

354

rawword <- t(ALZWORDF)

343

rawword <- t(ALZWORDF)

355

344

356

##where is ID_REF located

345

##where is ID_REF located

357

hereim <- grep("ID_REF",rownames(rawword))

346

hereim <- grep("ID_REF",rownames(rawword))

358

347

359

##Subject Names GSM...

348

##Subject Names GSM...

360

subjnam <- rawword[hereim,]

349

subjnam <- rawword[hereim,]

361

350

362

##Getting the names for the rows

351

##Getting the names for the rows

363

namedarows <- rownames(rawword)[-hereim] %>%

352

namedarows <- rownames(rawword)[-hereim] %>%

364

as.data.frame()

353

as.data.frame()

365

RAWWORD <- rawword[-hereim,] %>%

354

RAWWORD <- rawword[-hereim,] %>%

366

as.data.frame() %>%

355

as.data.frame() %>%

367

bind_cols(namedarows,.)

356

bind_cols(namedarows,.)

368

z <- 1

357

z <- 1

369

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

358

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

370

for(z in 1:dim(RAWWORD)[1]){

359

for(z in 1:dim(RAWWORD)[1]){

371

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

360

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

372

z <- z + 1

361

z <- z + 1

373

}

362

}

374

363

375

colnames(naroww) <- "ROW_NAs"

364

colnames(naroww) <- "ROW_NAs"

376

RAWWORD <- bind_cols(RAWWORD,naroww)

365

RAWWORD <- bind_cols(RAWWORD,naroww)

377

366

378

367

379

roALZna <- t(ALZDAT) %>%

368

roALZna <- t(ALZDAT) %>%

380

rownames(.) %>%

369

rownames(.) %>%

381

as.data.frame(.)

370

as.data.frame(.)

382

colnames(roALZna) <- "ID_REF"

371

colnames(roALZna) <- "ID_REF"

383

372

384

RAWDAT <- t(ALZDAT) %>%

373

RAWDAT <- t(ALZDAT) %>%

385

as.data.frame(.)

374

as.data.frame(.)

386

colnames(RAWDAT) <- NULL

375

colnames(RAWDAT) <- NULL

387

rownames(RAWDAT) <- NULL

376

rownames(RAWDAT) <- NULL

388

377

389

RAWDAT2 <- RAWDAT %>%

378

RAWDAT2 <- RAWDAT %>%

390

cbind(roALZna,.) %>%

379

cbind(roALZna,.) %>%

391

dplyr::arrange(.,ID_REF)

380

dplyr::arrange(.,ID_REF)

392

381

393

##Editing the file for R processing

382

##Editing the file for R processing

394

RAWDATID <- RAWDAT2[,1] %>%

383

RAWDATID <- RAWDAT2[,1] %>%

395

as.matrix(.)

384

as.matrix(.)

396

385

397

RAWDATNUM <- RAWDAT2[,-1] %>%

386

RAWDATNUM <- RAWDAT2[,-1] %>%

398

mapply(.,FUN = as.numeric) %>%

387

mapply(.,FUN = as.numeric) %>%

399

t(.)

388

t(.)

400

389

401

##Consolidating genes with the same name

390

##Consolidating genes with the same name

402

###create empty matrix of size equal to tabRDATID

391

###create empty matrix of size equal to tabRDATID

403

tabRDATID <- table(RAWDATID)

392

tabRDATID <- table(RAWDATID)

404

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

393

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

405

j <- 1

394

j <- 1

406

for(j in 1:length(tabRDATID)){

395

for(j in 1:length(tabRDATID)){

407

##Putting the ones without duplicates in their new homes

396

##Putting the ones without duplicates in their new homes

408

if(tabRDATID[j] == 1){

397

if(tabRDATID[j] == 1){

409

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

398

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

410

}

399

} else if(tabRDATID[j] > 1){

411

##Averaging duplicates and putting them in their new homes

400

##Averaging duplicates and putting them in their new homes

412

else if(tabRDATID[j] > 1){

413

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

401

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

414

}

402

}

415

j <- j + 1

403

j <- j + 1

416

}

404

}

417

405

418

##Scaling the Data

406

##Scaling the Data

419

scrawdat <- NuRDATN%>%

407

scrawdat <- NuRDATN%>%

420

scale()

408

scale()

421

attr(scrawdat,"scaled:center") <- NULL

409

attr(scrawdat,"scaled:center") <- NULL

422

attr(scrawdat,"scaled:scale") <- NULL

410

attr(scrawdat,"scaled:scale") <- NULL

423

colnames(scrawdat) <- rownames(tabRDATID)

411

colnames(scrawdat) <- rownames(tabRDATID)

424

412

425

##Discretized the Data

413

##Discretized the Data

426

dialzdat <- scrawdat %>%

414

dialzdat <- scrawdat %>%

427

dndat(.) %>%

415

dndat(.) %>%

428

t()%>%

416

t()%>%

429

as.data.frame(.)

417

as.data.frame(.)

430

colnames(dialzdat) <- rownames(RAWDATNUM)

418

colnames(dialzdat) <- rownames(RAWDATNUM)

431

419

432

##setting "ID_REF" as a new variable

420

##setting "ID_REF" as a new variable

433

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

421

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

434

colnames(geneNAM) <- "ID_REF"

422

colnames(geneNAM) <- "ID_REF"

435

rownames(dialzdat) <- NULL

423

rownames(dialzdat) <- NULL

436

dialzdat <-bind_cols(geneNAM,dialzdat)

424

dialzdat <-bind_cols(geneNAM,dialzdat)

437

425

438

##NAs in a column

426

##NAs in a column

439

x <- 2

427

x <- 2

440

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

428

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

441

nacol[1,1] = "COL_NAs"

429

nacol[1,1] = "COL_NAs"

442

for(x in 2:dim(dialzdat)[2]){

430

for(x in 2:dim(dialzdat)[2]){

443

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

431

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

444

x <- x + 1

432

x <- x + 1

445

}

433

}

446

colnames(nacol) <- colnames(dialzdat)

434

colnames(nacol) <- colnames(dialzdat)

447

dialzdat <- bind_rows(dialzdat,nacol)

435

dialzdat <- bind_rows(dialzdat,nacol)

448

436

449

##NAs in a row

437

##NAs in a row

450

y <- 1

438

y <- 1

451

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

439

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

452

for(y in 1:dim(dialzdat)[1]){

440

for(y in 1:dim(dialzdat)[1]){

453

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

441

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

454

y <- y + 1

442

y <- y + 1

455

}

443

}

456

colnames(narowd) <- "ROW_NAs"

444

colnames(narowd) <- "ROW_NAs"

457

dialzdat <- bind_cols(dialzdat,narowd)

445

dialzdat <- bind_cols(dialzdat,narowd)

458

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

446

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

459

colnames(RAWWORD) <- colnames(dialzdat)

447

colnames(RAWWORD) <- colnames(dialzdat)

460

##converting to character so that the clinical can be brought together with discrete data

448

##converting to character so that the clinical can be brought together with discrete data

461

k <- 2

449

k <- 2

462

for(k in 2:dim(dialzdat)[2]-1){

450

for(k in 2:dim(dialzdat)[2]-1){

463

dialzdat[,k] <- as.character(dialzdat[,k])

451

dialzdat[,k] <- as.character(dialzdat[,k])

464

k <- k + 1

452

k <- k + 1

465

}

453

}

466

#The End the full data

454

#The End the full data

467

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

455

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

468

456

469

#Produces Discrete file

457

#Produces Discrete file

470

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

458

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

471

.[[1]] %>%

459

.[[1]] %>%

472

.[length(.)] %>%

460

.[length(.)] %>%

473

gsub("\\D","",.) %>%

461

gsub("\\D","",.) %>%

474

c("GSE",.,"dscrt.txt") %>%

462

c("GSE",.,"dscrt.txt") %>%

475

paste(collapse = "")

463

paste(collapse = "")

476

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

464

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

477

n <- n +1

465

n <- n +1

478

}

466

}

479

}

467

} else if(numDAT == 2){

480

481

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

468

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

482

else if(numDAT == 2){

469

483

#All the files you want to analyze

470

#All the files you want to analyze

484

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

471

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

485

if(length(ANDIS) == 0){

472

if(length(ANDIS) == 0){

486

#Spit out a warning

473

#Spit out a warning

487

warning("You did not select any files and so no cleaning will be performed")

474

warning("You did not select any files and so no cleaning will be performed")

488

} else{

475

} else{

489

#indexing the data files

476

#indexing the data files

490

n <- 1

477

n <- 1

491

for(n in 1: length(ANDIS)){

478

for(n in 1: length(ANDIS)){

492

alz <- ANDIS[n]

479

alz <- ANDIS[n]

493

480

494

#Working with the wordy part of the document

481

#Working with the wordy part of the document

495

alzword <- alz %>%

482

alzword <- alz %>%

496

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

483

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

497

filter(grepl("!Sample",X1))%>%

484

filter(grepl("!Sample",X1))%>%

498

filter(!grepl("!Sample_contact",X1))

485

filter(!grepl("!Sample_contact",X1))

499

486

500

#Getting the GPL file

487

#Getting the GPL file

501

genena <- grep("_platform_id",alzword$X1) %>%

488

genena <- grep("_platform_id",alzword$X1) %>%

502

alzword$X2[.] %>%

489

alzword$X2[.] %>%

503

str_trim(.) %>%

490

str_trim(.) %>%

504

paste0("^",.,"\\D") %>%

491

paste0("^",.,"\\D") %>%

505

grep(.,list.files()) %>%

492

grep(.,list.files()) %>%

506

list.files()[.]

493

list.files()[.]

507

494

508

#Find out if it is a soft GPL file or not

495

#Find out if it is a soft GPL file or not

509

soft <- strsplit(genena,"[\\|/]") %>%

496

soft <- strsplit(genena,"[\\|/]") %>%

510

.[[1]] %>%

497

.[[1]] %>%

511

.[length(.)] %>%

498

.[length(.)] %>%

512

grepl("soft",.)

499

grepl("soft",.)

513

500

514

##Changing row names and column names:

501

##Changing row names and column names:

515

ALZWORD <- t(alzword)

502

ALZWORD <- t(alzword)

516

rownames(ALZWORD)=NULL

503

rownames(ALZWORD)=NULL

517

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

504

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

518

ALZWORD <- chngrownm(ALZWORD)[-1,]

505

ALZWORD <- chngrownm(ALZWORD)[-1,]

519

ALZWORD <- ALZWORD%>%

506

ALZWORD <- ALZWORD%>%

520

as.data.frame()%>%

507

as.data.frame()%>%

521

dplyr::select(-starts_with("col"))

508

dplyr::select(-starts_with("col"))

522

509

523

##Reorganizing information within the columns and final clinical data

510

##Reorganizing information within the columns and final clinical data

524

ALZWORDF <- cinfo(ALZWORD)

511

ALZWORDF <- cinfo(ALZWORD)

525

512

526

513

527

#Working with Actual Data part of file

514

#Working with Actual Data part of file

528

alzdat <- alz %>%

515

alzdat <- alz %>%

529

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

516

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

530

ALZDAT <- t(alzdat[,-1])

517

ALZDAT <- t(alzdat[,-1])

531

rownames(ALZDAT)=NULL

518

rownames(ALZDAT)=NULL

532

519

533

##Is there a clean version of the GPL file available?

520

##Is there a clean version of the GPL file available?

534

gplnum <- strsplit(genena,"[\\|/]") %>%

521

gplnum <- strsplit(genena,"[\\|/]") %>%

535

.[[1]] %>%

522

.[[1]] %>%

536

.[length(.)] %>%

523

.[length(.)] %>%

537

gsub("\\D","",.)

524

gsub("\\D","",.)

538

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

525

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

539

if(clfileex >= 1){

526

if(clfileex >= 1){

540

#use the clean version

527

#use the clean version

541

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

528

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

542

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

529

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

543

530

544

}

531

} else if(clfileex == 0){

545

else if(clfileex == 0){

546

##Let's create a clean version

532

##Let's create a clean version

547

533

548

##Gene ID to Gene Name

534

##Gene ID to Gene Name

549

if(soft == TRUE){

535

if(soft == TRUE){

550

#Check to see if there is already a file containing information on soft files

536

#Check to see if there is already a file containing information on soft files

551

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

537

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

552

if(fileex == 1){

538

if(fileex == 1){

553

#Check to see if this GPL soft file has been used before

539

#Check to see if this GPL soft file has been used before

554

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

540

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

555

.$GPL_FILE_NUM%>%

541

.$GPL_FILE_NUM%>%

556

grepl(gplnum,.) %>%

542

grepl(gplnum,.) %>%

557

sum()

543

sum()

558

if(IDF == 1){

544

if(IDF == 1){

559

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

545

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

560

.$GPL_FILE_NUM%>%

546

.$GPL_FILE_NUM%>%

561

grep(gplnum,.)

547

grep(gplnum,.)

562

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

548

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

563

.$LOC_ID %>%

549

.$LOC_ID %>%

564

.[IDLOCAL]

550

.[IDLOCAL]

565

geneIDNam <- genena %>%

551

geneIDNam <- genena %>%

566

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

552

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

567

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

553

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

568

}

554

} else if(IDF == 0){

569

else if(IDF == 0){

570

#No information on this particular GPL file

555

#No information on this particular GPL file

571

idLOCGPL <- genena %>%

556

idLOCGPL <- genena %>%

572

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

557

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

573

t(.) %>%

558

t(.) %>%

574

grep("^ID\\s*$",.) %>%

559

grep("^ID\\s*$",.) %>%

575

-1

560

-1

576

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

561

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

577

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

562

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

578

geneIDNam <- genena %>%

563

geneIDNam <- genena %>%

579

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

564

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

580

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

565

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

581

}

566

}

582

}

567

} else if(fileex == 0){

583

else if(fileex == 0){

584

#We must create a file that we can access for later use

568

#We must create a file that we can access for later use

585

idLOCGPL <- genena %>%

569

idLOCGPL <- genena %>%

586

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

570

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

587

t(.) %>%

571

t(.) %>%

588

grep("^ID\\s*$",.) %>%

572

grep("^ID\\s*$",.) %>%

589

-1

573

-1

590

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

574

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

591

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

575

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

592

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

576

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

593

geneIDNam <- genena %>%

577

geneIDNam <- genena %>%

594

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

578

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

595

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

579

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

596

}

580

}

597

}

581

} else if(soft == FALSE){

598

else if(soft == FALSE){

599

geneIDNam <- genena %>%

582

geneIDNam <- genena %>%

600

read_delim(delim="\t",comment = "#")%>%

583

read_delim(delim="\t",comment = "#")%>%

601

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

584

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

602

}

585

}

603

586

604

##Labeling the gene IDs without names

587

##Labeling the gene IDs without names

605

geneIDNam <- NAFIXING(geneIDNam)

588

geneIDNam <- NAFIXING(geneIDNam)

606

589

607

##remove the whitespace

590

##remove the whitespace

608

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

591

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

609

592

610

##Here is the clean version

593

##Here is the clean version

611

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

594

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

612

}

595

}

613

596

614

597

615

598

616

##Changing the gene ID to gene name

599

##Changing the gene ID to gene name

617

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

600

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

618

colnames(ALZDAT) = ALZDAT1[1,]

601

colnames(ALZDAT) = ALZDAT1[1,]

619

602

620

603

621

##Adjusting the column names aka the gene names

604

##Adjusting the column names aka the gene names

622

colnames(ALZDAT) <- gcnames(ALZDAT)

605

colnames(ALZDAT) <- gcnames(ALZDAT)

623

606

624

607

625

#Full RAW Data

608

#Full RAW Data

626

Fullalzdwr <- ALZDAT %>%

609

Fullalzdwr <- ALZDAT %>%

627

as.data.frame() %>%

610

as.data.frame() %>%

628

cbind(ALZWORDF,.)

611

cbind(ALZWORDF,.)

629

612

630

#Raw file is output

613

#Raw file is output

631

nfnaex <- strsplit(alz,"[\\]") %>%

614

nfnaex <- strsplit(alz,"[\\]") %>%

632

.[[1]] %>%

615

.[[1]] %>%

633

.[length(.)] %>%

616

.[length(.)] %>%

634

gsub("\\D","",.) %>%

617

gsub("\\D","",.) %>%

635

c("GSE",.,"aftexcel.txt") %>%

618

c("GSE",.,"aftexcel.txt") %>%

636

paste(collapse = "")

619

paste(collapse = "")

637

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

620

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

638

621

639

622

640

623

641

#Now for the discretization part

624

#Now for the discretization part

642

##get the wordy part again

625

##get the wordy part again

643

rawword <- t(ALZWORDF)

626

rawword <- t(ALZWORDF)

644

627

645

##where is ID_REF located

628

##where is ID_REF located

646

hereim <- grep("ID_REF",rownames(rawword))

629

hereim <- grep("ID_REF",rownames(rawword))

647

630

648

##Subject Names GSM...

631

##Subject Names GSM...

649

subjnam <- rawword[hereim,]

632

subjnam <- rawword[hereim,]

650

633

651

##Getting the names for the rows

634

##Getting the names for the rows

652

namedarows <- rownames(rawword)[-hereim] %>%

635

namedarows <- rownames(rawword)[-hereim] %>%

653

as.data.frame()

636

as.data.frame()

654

RAWWORD <- rawword[-hereim,] %>%

637

RAWWORD <- rawword[-hereim,] %>%

655

as.data.frame() %>%

638

as.data.frame() %>%

656

bind_cols(namedarows,.)

639

bind_cols(namedarows,.)

657

z <- 1

640

z <- 1

658

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

641

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

659

for(z in 1:dim(RAWWORD)[1]){

642

for(z in 1:dim(RAWWORD)[1]){

660

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

643

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

661

z <- z + 1

644

z <- z + 1

662

}

645

}

663

646

664

colnames(naroww) <- "ROW_NAs"

647

colnames(naroww) <- "ROW_NAs"

665

RAWWORD <- bind_cols(RAWWORD,naroww)

648

RAWWORD <- bind_cols(RAWWORD,naroww)

666

649

667

650

668

roALZna <- t(ALZDAT) %>%

651

roALZna <- t(ALZDAT) %>%

669

rownames(.) %>%

652

rownames(.) %>%

670

as.data.frame(.)

653

as.data.frame(.)

671

colnames(roALZna) <- "ID_REF"

654

colnames(roALZna) <- "ID_REF"

672

655

673

RAWDAT <- t(ALZDAT) %>%

656

RAWDAT <- t(ALZDAT) %>%

674

as.data.frame(.)

657

as.data.frame(.)

675

colnames(RAWDAT) <- NULL

658

colnames(RAWDAT) <- NULL

676

rownames(RAWDAT) <- NULL

659

rownames(RAWDAT) <- NULL

677

660

678

RAWDAT2 <- RAWDAT %>%

661

RAWDAT2 <- RAWDAT %>%

679

cbind(roALZna,.) %>%

662

cbind(roALZna,.) %>%

680

dplyr::arrange(.,ID_REF)

663

dplyr::arrange(.,ID_REF)

681

664

682

##Editing the file for R processing

665

##Editing the file for R processing

683

RAWDATID <- RAWDAT2[,1] %>%

666

RAWDATID <- RAWDAT2[,1] %>%

684

as.matrix(.)

667

as.matrix(.)

685

668

686

RAWDATNUM <- RAWDAT2[,-1] %>%

669

RAWDATNUM <- RAWDAT2[,-1] %>%

687

mapply(.,FUN = as.numeric) %>%

670

mapply(.,FUN = as.numeric) %>%

688

t(.)

671

t(.)

689

672

690

##Consolidating genes with the same name

673

##Consolidating genes with the same name

691

###create empty matrix of size equal to tabRDATID

674

###create empty matrix of size equal to tabRDATID

692

tabRDATID <- table(RAWDATID)

675

tabRDATID <- table(RAWDATID)

693

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

676

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

694

j <- 1

677

j <- 1

695

for(j in 1:length(tabRDATID)){

678

for(j in 1:length(tabRDATID)){

696

##Putting the ones without duplicates in their new homes

679

##Putting the ones without duplicates in their new homes

697

if(tabRDATID[j] == 1){

680

if(tabRDATID[j] == 1){

698

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

681

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

699

}

682

} else if(tabRDATID[j] > 1){

700

##Averaging duplicates and putting them in their new homes

683

##Averaging duplicates and putting them in their new homes

701

else if(tabRDATID[j] > 1){

702

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

684

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

703

}

685

}

704

j <- j + 1

686

j <- j + 1

705

}

687

}

706

688

707

##Scaling the Data

689

##Scaling the Data

708

scrawdat <- NuRDATN%>%

690

scrawdat <- NuRDATN%>%

709

scale()

691

scale()

710

attr(scrawdat,"scaled:center") <- NULL

692

attr(scrawdat,"scaled:center") <- NULL

711

attr(scrawdat,"scaled:scale") <- NULL

693

attr(scrawdat,"scaled:scale") <- NULL

712

colnames(scrawdat) <- rownames(tabRDATID)

694

colnames(scrawdat) <- rownames(tabRDATID)

713

695

714

##Discretizing the Data

696

##Discretizing the Data

715

dialzdat <- scrawdat %>%

697

dialzdat <- scrawdat %>%

716

dndat(.) %>%

698

dndat(.) %>%

717

t()%>%

699

t()%>%

718

as.data.frame(.)

700

as.data.frame(.)

719

colnames(dialzdat) <- rownames(RAWDATNUM)

701

colnames(dialzdat) <- rownames(RAWDATNUM)

720

702

721

##setting "ID_REF" as a new variable

703

##setting "ID_REF" as a new variable

722

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

704

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

723

colnames(geneNAM) <- "ID_REF"

705

colnames(geneNAM) <- "ID_REF"

724

rownames(dialzdat) <- NULL

706

rownames(dialzdat) <- NULL

725

dialzdat <-bind_cols(geneNAM,dialzdat)

707

dialzdat <-bind_cols(geneNAM,dialzdat)

726

708

727

##NAs in a column

709

##NAs in a column

728

x <- 2

710

x <- 2

729

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

711

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

730

nacol[1,1] = "COL_NAs"

712

nacol[1,1] = "COL_NAs"

731

for(x in 2:dim(dialzdat)[2]){

713

for(x in 2:dim(dialzdat)[2]){

732

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

714

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

733

x <- x + 1

715

x <- x + 1

734

}

716

}

735

colnames(nacol) <- colnames(dialzdat)

717

colnames(nacol) <- colnames(dialzdat)

736

dialzdat <- bind_rows(dialzdat,nacol)

718

dialzdat <- bind_rows(dialzdat,nacol)

737

719

738

##NAs in a row

720

##NAs in a row

739

y <- 1

721

y <- 1

740

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

722

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

741

for(y in 1:dim(dialzdat)[1]){

723

for(y in 1:dim(dialzdat)[1]){

742

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

724

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

743

y <- y + 1

725

y <- y + 1

744

}

726

}

745

colnames(narowd) <- "ROW_NAs"

727

colnames(narowd) <- "ROW_NAs"

746

dialzdat <- bind_cols(dialzdat,narowd)

728

dialzdat <- bind_cols(dialzdat,narowd)

747

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

729

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

748

colnames(RAWWORD) <- colnames(dialzdat)

730

colnames(RAWWORD) <- colnames(dialzdat)

749

##converting to character so that the clinical can be brought together with discrete data

731

##converting to character so that the clinical can be brought together with discrete data

750

k <- 2

732

k <- 2

751

for(k in 2:dim(dialzdat)[2]-1){

733

for(k in 2:dim(dialzdat)[2]-1){

752

dialzdat[,k] <- as.character(dialzdat[,k])

734

dialzdat[,k] <- as.character(dialzdat[,k])

753

k <- k + 1

735

k <- k + 1

754

}

736

}

755

#The end: the full data (clinical rows bound on top of the discretized rows)

737

#The end: the full data (clinical rows bound on top of the discretized rows)

756

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

738

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

757

739

758

#Produces Discrete file

740

#Produces Discrete file

759

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

741

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

760

.[[1]] %>%

742

.[[1]] %>%

761

.[length(.)] %>%

743

.[length(.)] %>%

762

gsub("\\D","",.) %>%

744

gsub("\\D","",.) %>%

763

c("GSE",.,"dscrt.txt") %>%

745

c("GSE",.,"dscrt.txt") %>%

764

paste(collapse = "")

746

paste(collapse = "")

765

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

747

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

766

748

767

749

768

n <- n + 1

750

n <- n + 1

769

}

751

}

770

}

752

}

771

}

753

}

772

}

754

}

773

#The rest of this code is run every time you want to change a data set

755

#The rest of this code is run every time you want to change a data set

774

#Call THEFT() with no arguments; its definition is not visible in this section
#NOTE(review): presumably the routine whose body closes just above - confirm
THEFT()

756

#Call THEFT() with no arguments; its definition is not visible in this section
#NOTE(review): presumably the routine whose body closes just above - confirm
THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Don't use this code yet