Efrain Gonzalez / Cleaning and Fixing Data with R

1

2

#Efrain H. Gonzalez

1

#Efrain H. Gonzalez

3

#6/22/2017

2

#6/25/2018

4

options(digits = 11)

3

options(digits = 11)

5

#Libraries required to run the code

4

#Libraries required to run the code

6

library(pryr)

5

library(pryr)

7

library(MASS)

6

library(MASS)

8

library(dplyr)

7

library(dplyr)

9

library(tidyr)

8

library(tidyr)

10

library(readr)

9

library(readr)

11

library(stringr)

10

library(stringr)

12

11

13

12

14

#Necessary Functions

13

#Necessary Functions

15

#1#Function for handling the changing of row names and column names

14

#1#Function for handling the changing of row names and column names

16

chngrownm <- function(mat){
  # Rename the columns of a transposed GEO sample-annotation table.
  #
  # Args:
  #   mat: matrix-like object that already has column names and at least two
  #        rows. Row 1 is read as the field label of each column (for example
  #        "!Sample_title"); row 2 is read as a sample value and is searched
  #        for keywords when the label is not one of the three fixed labels.
  #
  # Returns:
  #   `mat` with updated column names; cell contents are not modified.
  #   Columns that match nothing keep their existing name.
  #
  # The three fixed labels map to "Brain_Region", "Title" and "ID_REF". Any
  # other column is tested against the keyword patterns below, in order. Every
  # pattern that matches renames the column and advances its own counter, so
  # the LAST matching pattern decides the final name while earlier matches
  # still use up a number (e.g. "braak stage: III" also matches "age").
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  # One running counter per keyword family (Sex1, Sex2, ..., Age1, ...).
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  # seq_len() turns a zero-column input into a no-op; 1:0 would iterate 1, 0.
  for (e in seq_len(dim(mat)[2])) {
    label <- as.character(mat[1, e])
    # An NA label cannot be a fixed label; route it to the keyword search
    # instead of letting `if (NA)` abort the whole run.
    if (!is.na(label) && label %in% names(fixed_names)) {
      colnames(mat)[e] <- fixed_names[[label]]
    } else {
      for (prefix in names(keyword_patterns)) {
        # grepl() yields FALSE for an NA cell, so missing values match nothing.
        if (isTRUE(grepl(keyword_patterns[[prefix]], mat[2, e]))) {
          colnames(mat)[e] <- paste0(prefix, counters[[prefix]])
          counters[[prefix]] <- counters[[prefix]] + 1
        }
      }
    }
  }
  mat
}

59

58

60

#2#Function for reorganizing information within the columns

59

#2#Function for reorganizing information within the columns

61

cinfo <- function(mat){
  # Strip the "label: " prefixes from the clinical columns and convert the
  # numeric ones, choosing the rule from each column's name.
  #
  # Args:
  #   mat: data frame whose column names carry the prefixes "Group", "Age",
  #        "Sex", "PMI" or "Braak" for the columns to clean.
  #
  # Returns:
  #   `mat` with the matching columns rewritten:
  #     Group -> text after the last ": " (and a trailing " ...;..." removed)
  #     Age   -> all non-digits removed, then as.integer()
  #     Sex   -> text after the last ": "
  #     PMI   -> everything except digits and "." removed, then as.numeric()
  #     Braak -> text after the last ": " parsed as a roman numeral -> integer
  #   Other columns are returned unchanged.
  #
  # NOTE(review): the Age rule also drops decimal points ("80.5" -> 805);
  # confirm ages are always whole numbers in the input files.
  col_names <- colnames(mat)
  # Column 1 is never processed. seq_len(n)[-1] gives 2..n and is empty when
  # there are fewer than two columns, whereas 2:n would count down and touch
  # column 1.
  for (j in seq_len(dim(mat)[2])[-1]) {
    name <- col_names[j]
    values <- mat[, j]
    if (grepl("Group", name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", values)
    } else if (grepl("Age", name)) {
      mat[, j] <- as.integer(gsub("\\D", "", values))
    } else if (grepl("Sex", name)) {
      mat[, j] <- gsub(".+:\\s", "", values)
    } else if (grepl("PMI", name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", values))
    } else if (grepl("Braak", name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", values)))
    }
  }
  mat
}

84

83

85

#3#Function for labeling the gene IDs without names

84

#3#Function for labeling the gene IDs without names

86

NAFIXING <- function(GIDNAM){
  # Give every unnamed gene a usable label by copying its ID into the name
  # column.
  #
  # Args:
  #   GIDNAM: two-column table (data frame or matrix); column 1 holds the gene
  #           or probe ID, column 2 the gene name/symbol.
  #
  # Returns:
  #   `GIDNAM` where each column-2 entry that is NA, or is the literal text
  #   "NA" optionally followed by whitespace, has been replaced by the
  #   column-1 value of the same row. A table with no rows is returned as is.
  is_df <- is.data.frame(GIDNAM)
  # `[[` extracts a plain vector from data frames and tibbles alike; matrices
  # need ordinary column indexing.
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Whole-column test instead of one table lookup per row; it is also safe
  # for zero rows, where 1:row would have iterated over 1 and 0.
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  if (any(unnamed)) {
    symbols[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- symbols
    } else {
      GIDNAM[, 2] <- symbols
    }
  }
  GIDNAM
}

97

96

98

#4#Function for changing the gene ID to gene name

97

#4#Function for changing the gene ID to gene name

99

cgeneID <- function(GeneName,DATA){
  # Replace the probe/gene IDs of an expression table with gene names.
  #
  # Args:
  #   GeneName: two-column lookup table; column 1 = ID, column 2 = gene name.
  #   DATA:     expression table whose first column holds the IDs to translate.
  #
  # Returns:
  #   t(DATA) (a matrix) whose first row holds the translated names. IDs that
  #   are missing from the lookup table are left unchanged; when an ID occurs
  #   more than once in the lookup table its first entry is used.
  lookup <- t(GeneName)
  nq <- t(DATA)

  # Exact, literal matching. Building a regular expression out of each ID
  # mis-handles IDs that contain ".", "+", "(" and similar characters, and one
  # grep() per probe is quadratic in the number of probes. NA IDs are declared
  # incomparable so they stay NA rather than matching an NA lookup entry.
  hit <- match(nq[1, ], lookup[1, ], incomparables = NA)
  found <- which(!is.na(hit))
  if (length(found) > 0) {
    nq[1, found] <- lookup[2, hit[found]]
  }
  nq
}

117

#cgeneID <- function(GeneName,DATA){

116

#cgeneID <- function(GeneName,DATA){

118

# colGene <- dim(GeneName)[2]

117

# colGene <- dim(GeneName)[2]

119

# j <- 1

118

# j <- 1

120

# for(j in 1:colGene){

119

# for(j in 1:colGene){

121

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

120

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

122

# if(is.na(sum(chngsreq))==FALSE){

121

# if(is.na(sum(chngsreq))==FALSE){

123

# if(sum(chngsreq) > 0){

122

# if(sum(chngsreq) > 0){

124

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

123

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

125

# }

124

# }

126

# }

125

# }

127

# j = j+1

126

# j = j+1

128

# }

127

# }

129

# DATA

128

# DATA

130

#}

129

#}

131

130

132

#5#Function for adjusting the gene names

131

#5#Function for adjusting the gene names

133

gcnames <- function(DiData,usecol=1){
  # Reduce multi-gene column names to a single gene name each.
  #
  # Column names may list several genes separated by "///" or "//". For every
  # column the `usecol`-th entry is kept; when a name has fewer than `usecol`
  # entries its first entry is kept instead. Surrounding whitespace is trimmed
  # with str_trim().
  #
  # Args:
  #   DiData: matrix or data frame whose column names are to be simplified.
  #   usecol: 1-based position of the entry to keep (default 1).
  #
  # Returns:
  #   Character vector with one cleaned name per column of `DiData`
  #   (character(0) when there are no columns).
  if (dim(DiData)[2] == 0) {
    # 1:0 would have looped over columns 1 and 0 of an empty table.
    return(character(0))
  }
  # Split every name once rather than up to twice per loop iteration.
  pieces <- strsplit(colnames(DiData), "///|//")
  vapply(
    pieces,
    function(p) {
      # p[1] on an empty split (name was "") yields NA, which is kept as NA.
      str_trim(if (length(p) >= usecol) p[usecol] else p[1])
    },
    character(1)
  )
}

148

147

149

#6# Function for discretizing the data

148

#6# Function for discretizing the data

150

dndat <- function(NDATA){
  # Discretize a numeric table into three levels.
  #
  # Args:
  #   NDATA: numeric matrix (or data frame) of values to discretize.
  #
  # Returns:
  #   Numeric matrix with the same dimensions and column names as `NDATA`
  #   (row names are not carried over), where each value x becomes
  #     0 if x < -1,   1 if -1 <= x <= 1,   2 if x > 1,
  #   and missing values (NA/NaN) are passed through unchanged.
  rownd <- dim(NDATA)[1]
  colnd <- dim(NDATA)[2]
  DDATA <- matrix(0, nrow = rownd, ncol = colnd)
  colnames(DDATA) <- colnames(NDATA)

  # Whole-matrix masks replace the element-by-element double loop; they also
  # make empty input a no-op, where 1:0 used to index out of bounds.
  vals <- as.matrix(NDATA)
  missing <- is.na(vals)
  # Cells start at 0, which already covers x < -1.
  DDATA[!missing & vals >= -1 & vals <= 1] <- 1
  # which() drops the NA comparisons produced by missing cells.
  DDATA[which(vals > 1)] <- 2
  DDATA[missing] <- vals[missing]
  DDATA
}

177

176

178

177

179

#MajorFunction#This is the function that does everything else

178

#MajorFunction#This is the function that does everything else

180

THEFT <- function(){

179

THEFT <- function(){

181

#Set working directory based on the directory of the series matrix file Currently only works for windows

180

#Set working directory based on the directory of the series matrix file Currently only works for windows

182

wd <- getwd()

181

wd <- getwd()

183

#list.files()

182

#list.files()

184

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

183

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

185

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

184

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

186

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

185

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

187

GSEfloc <- list.files()[GSEfileloc]

186

GSEfloc <- list.files()[GSEfileloc]

188

#ALL DATA FILES WILL BE CLEANED

187

#ALL DATA FILES WILL BE CLEANED

189

if(numDAT == 1){

188

if(numDAT == 1){

190

#indexing the data files

189

#indexing the data files

191

n <- 1

190

n <- 1

192

for(n in 1: length(GSEfloc)){

191

for(n in 1: length(GSEfloc)){

193

alz <- GSEfloc[n]

192

alz <- GSEfloc[n]

194

193

195

#Working with the wordy part of the document

194

#Working with the wordy part of the document

196

alzword <- alz %>%

195

alzword <- alz %>%

197

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

196

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

198

filter(grepl("!Sample",X1))%>%

197

filter(grepl("!Sample",X1))%>%

199

filter(!grepl("!Sample_contact",X1))

198

filter(!grepl("!Sample_contact",X1))

200

199

201

#Getting the GPL file

200

#Getting the GPL file

202

genena <- grep("_platform_id",alzword$X1) %>%

201

genena <- grep("_platform_id",alzword$X1) %>%

203

alzword$X2[.] %>%

202

alzword$X2[.] %>%

204

str_trim(.) %>%

203

str_trim(.) %>%

205

paste0("^",.,"\\D") %>%

204

paste0("^",.,"\\D") %>%

206

grep(.,list.files()) %>%

205

grep(.,list.files()) %>%

207

list.files()[.]

206

list.files()[.]

208

207

209

#Find out if it is a soft GPL file or not

208

#Find out if it is a soft GPL file or not

210

soft <- strsplit(genena,"[\\|/]") %>%

209

soft <- strsplit(genena,"[\\|/]") %>%

211

.[[1]] %>%

210

.[[1]] %>%

212

.[length(.)] %>%

211

.[length(.)] %>%

213

grepl("soft",.)

212

grepl("soft",.)

214

213

215

##Changing row names and column names:

214

##Changing row names and column names:

216

ALZWORD <- t(alzword)

215

ALZWORD <- t(alzword)

217

rownames(ALZWORD)=NULL

216

rownames(ALZWORD)=NULL

218

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

217

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

219

ALZWORD <- chngrownm(ALZWORD)[-1,]

218

ALZWORD <- chngrownm(ALZWORD)[-1,]

220

ALZWORD <- ALZWORD%>%

219

ALZWORD <- ALZWORD%>%

221

as.data.frame(.,stringsAsFactors = FALSE)%>%

220

as.data.frame(.,stringsAsFactors = FALSE)%>%

222

dplyr::select(-starts_with("col"))

221

dplyr::select(-starts_with("col"))

223

222

224

##Reorganizing information within the columns and final clinical data

223

##Reorganizing information within the columns and final clinical data

225

ALZWORDF <- cinfo(ALZWORD)

224

ALZWORDF <- cinfo(ALZWORD)

226

225

227

226

228

#Working with Actual Data part of file

227

#Working with Actual Data part of file

229

alzdat <- alz %>%

228

alzdat <- alz %>%

230

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

229

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

231

ALZDAT <- t(alzdat[,-1])

230

ALZDAT <- t(alzdat[,-1])

232

rownames(ALZDAT)=NULL

231

rownames(ALZDAT)=NULL

233

232

234

##Is there a clean version of the GPL file available?

233

##Is there a clean version of the GPL file available?

235

gplnum <- strsplit(genena,"[\\|/]") %>%

234

gplnum <- strsplit(genena,"[\\|/]") %>%

236

.[[1]] %>%

235

.[[1]] %>%

237

.[length(.)] %>%

236

.[length(.)] %>%

238

gsub("\\D","",.)

237

gsub("\\D","",.)

239

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

238

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

240

if(clfileex >= 1){

239

if(clfileex >= 1){

241

#use the clean version

240

#use the clean version

242

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

241

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

243

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

242

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

244

243

245

} else if(clfileex == 0){

244

} else if(clfileex == 0){

246

##Lets Create a clean version

245

##Lets Create a clean version

247

246

248

##Gene ID to Gene Name

247

##Gene ID to Gene Name

249

if(soft == TRUE){

248

if(soft == TRUE){

250

#Check to see if there is already a file containing information on soft files

249

#Check to see if there is already a file containing information on soft files

251

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

250

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

252

if(fileex == 1){

251

if(fileex == 1){

253

#Check to see if this GPL soft file has been used before

252

#Check to see if this GPL soft file has been used before

254

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

253

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

255

.$GPL_FILE_NUM%>%

254

.$GPL_FILE_NUM%>%

256

grepl(gplnum,.) %>%

255

grepl(gplnum,.) %>%

257

sum()

256

sum()

258

if(IDF == 1){

257

if(IDF == 1){

259

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

258

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

260

.$GPL_FILE_NUM%>%

259

.$GPL_FILE_NUM%>%

261

grep(gplnum,.)

260

grep(gplnum,.)

262

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

261

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

263

.$LOC_ID %>%

262

.$LOC_ID %>%

264

.[IDLOCAL]

263

.[IDLOCAL]

265

geneIDNam <- genena %>%

264

geneIDNam <- genena %>%

266

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

265

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

267

266

268

} else if(IDF == 0){

267

} else if(IDF == 0){

269

#No information on this particular GPL file

268

#No information on this particular GPL file

270

idLOCGPL <- genena %>%

269

idLOCGPL <- genena %>%

271

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

270

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

272

t(.) %>%

271

t(.) %>%

273

grep("^ID\\s*$",.) %>%

272

grep("^ID\\s*$",.) %>%

274

-1

273

-1

275

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

274

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

276

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

275

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

277

geneIDNam <- genena %>%

276

geneIDNam <- genena %>%

278

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

277

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

279

278

280

}

279

}

281

} else if(fileex == 0){

280

} else if(fileex == 0){

282

#We must create a file that we can access for later use

281

#We must create a file that we can access for later use

283

idLOCGPL <- genena %>%

282

idLOCGPL <- genena %>%

284

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

283

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

285

t(.) %>%

284

t(.) %>%

286

grep("^ID\\s*$",.) %>%

285

grep("^ID\\s*$",.) %>%

287

-1

286

-1

288

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

287

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

289

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

288

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

290

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

289

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

291

geneIDNam <- genena %>%

290

geneIDNam <- genena %>%

292

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

291

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

293

292

294

}

293

}

295

} else if(soft == FALSE){

294

} else if(soft == FALSE){

296

geneIDNam <- genena %>%

295

geneIDNam <- genena %>%

297

read_delim(delim="\t",comment = "#")%>%

296

read_delim(delim="\t",comment = "#")%>%

298

297

299

}

298

}

300

299

301

##Labeling the gene IDs without names

300

##Labeling the gene IDs without names

302

geneIDNam <- NAFIXING(geneIDNam)

301

geneIDNam <- NAFIXING(geneIDNam)

303

302

304

##remove the whitespace

303

##remove the whitespace

305

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

304

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

306

305

307

##Here is the clean version

306

##Here is the clean version

308

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

307

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

309

}

308

}

310

309

311

310

312

311

313

##Changing the gene ID to gene name

312

##Changing the gene ID to gene name

314

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

313

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

315

colnames(ALZDAT) = ALZDAT1[1,]

314

colnames(ALZDAT) = ALZDAT1[1,]

316

315

317

316

318

##Adjusting the column names aka the gene names

317

##Adjusting the column names aka the gene names

319

colnames(ALZDAT) <- gcnames(ALZDAT)

318

colnames(ALZDAT) <- gcnames(ALZDAT)

320

319

321

320

322

#Full RAW Data

321

#Full RAW Data

323

Fullalzdwr <- ALZDAT %>%

322

Fullalzdwr <- ALZDAT %>%

324

as.data.frame(.,stringsAsFactors = FALSE) %>%

323

as.data.frame(.,stringsAsFactors = FALSE) %>%

325

cbind(ALZWORDF,.)

324

cbind(ALZWORDF,.)

326

325

327

#Raw file is output

326

#Raw file is output

328

nfnaex <- strsplit(alz,"[\\]") %>%

327

nfnaex <- strsplit(alz,"[\\|/]") %>%

329

.[[1]] %>%

328

.[[1]] %>%

330

.[length(.)] %>%

329

.[length(.)] %>%

331

gsub("\\D","",.) %>%

330

gsub("\\D","",.) %>%

332

c("GSE",.,"aftexcel.txt") %>%

331

c("GSE",.,"aftexcel.txt") %>%

333

paste(collapse = "")

332

paste(collapse = "")

334

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

333

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

335

334

336

335

337

336

338

#Now for the discretization part

337

#Now for the discretization part

339

##get the wordy part again

338

##get the wordy part again

340

rawword <- t(ALZWORDF)

339

rawword <- t(ALZWORDF)

341

340

342

##where is ID_REF located

341

##where is ID_REF located

343

hereim <- grep("ID_REF",rownames(rawword))

342

hereim <- grep("ID_REF",rownames(rawword))

344

343

345

##Subject Names GSM...

344

##Subject Names GSM...

346

subjnam <- rawword[hereim,]

345

subjnam <- rawword[hereim,]

347

346

348

##Getting the names for the rows

347

##Getting the names for the rows

349

namedarows <- rownames(rawword)[-hereim] %>%

348

namedarows <- rownames(rawword)[-hereim] %>%

350

as.data.frame(.,stringsAsFactors = FALSE)

349

as.data.frame(.,stringsAsFactors = FALSE)

351

RAWWORD <- rawword[-hereim,] %>%

350

RAWWORD <- rawword[-hereim,] %>%

352

as.data.frame(.,stringsAsFactors = FALSE) %>%

351

as.data.frame(.,stringsAsFactors = FALSE) %>%

353

bind_cols(namedarows,.)

352

bind_cols(namedarows,.)

354

z <- 1

353

z <- 1

355

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

354

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

356

for(z in 1:dim(RAWWORD)[1]){

355

for(z in 1:dim(RAWWORD)[1]){

357

if(sum(is.na(RAWWORD[z,])) > 0){

356

if(sum(is.na(RAWWORD[z,])) > 0){

358

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

357

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

359

}

358

}

360

if(length(grep("NA",RAWWORD[z,])) > 0){

359

if(length(grep("NA",RAWWORD[z,])) > 0){

361

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

360

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

362

}

361

}

363

z <- z + 1

362

z <- z + 1

364

}

363

}

365

364

366

colnames(naroww) <- "ROW_NAs"

365

colnames(naroww) <- "ROW_NAs"

367

RAWWORD <- bind_cols(RAWWORD,naroww)

366

RAWWORD <- bind_cols(RAWWORD,naroww)

368

367

369

368

370

roALZna <- t(ALZDAT) %>%

369

roALZna <- t(ALZDAT) %>%

371

rownames(.) %>%

370

rownames(.) %>%

372

as.data.frame(.,stringsAsFactors = FALSE)

371

as.data.frame(.,stringsAsFactors = FALSE)

373

colnames(roALZna) <- "ID_REF"

372

colnames(roALZna) <- "ID_REF"

374

373

375

RAWDAT <- t(ALZDAT) %>%

374

RAWDAT <- t(ALZDAT) %>%

376

as.data.frame(.,stringsAsFactors = FALSE)

375

as.data.frame(.,stringsAsFactors = FALSE)

377

colnames(RAWDAT) <- NULL

376

colnames(RAWDAT) <- NULL

378

rownames(RAWDAT) <- NULL

377

rownames(RAWDAT) <- NULL

379

378

380

RAWDAT2 <- RAWDAT %>%

379

RAWDAT2 <- RAWDAT %>%

381

cbind(roALZna,.) %>%

380

cbind(roALZna,.) %>%

382

dplyr::arrange(.,ID_REF)

381

dplyr::arrange(.,ID_REF)

383

382

384

##Editing the file for R processing

383

##Editing the file for R processing

385

RAWDATID <- RAWDAT2[,1] %>%

384

RAWDATID <- RAWDAT2[,1] %>%

386

as.matrix(.)

385

as.matrix(.)

387

386

388

RAWDATNUM <- RAWDAT2[,-1] %>%

387

RAWDATNUM <- RAWDAT2[,-1] %>%

389

mapply(.,FUN = as.numeric) %>%

388

mapply(.,FUN = as.numeric) %>%

390

t(.)

389

t(.)

391

390

392

##Consolidating genes with the same name

391

##Consolidating genes with the same name

393

###create empty matrix of size equal to tabRDATID

392

###create empty matrix of size equal to tabRDATID

394

tabRDATID <- table(RAWDATID)

393

tabRDATID <- table(RAWDATID)

395

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

394

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

396

j <- 1

395

j <- 1

397

for(j in 1:length(tabRDATID)){

396

for(j in 1:length(tabRDATID)){

398

##Putting the ones without duplicates in their new homes

397

##Putting the ones without duplicates in their new homes

399

if(tabRDATID[j] == 1){

398

if(tabRDATID[j] == 1){

400

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

399

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

401

} else if(tabRDATID[j] > 1){

400

} else if(tabRDATID[j] > 1){

402

##Averaging duplicates and putting them in their new homes

401

##Averaging duplicates and putting them in their new homes

403

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

402

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

404

}

403

}

405

j <- j + 1

404

j <- j + 1

406

}

405

}

407

406

408

407

409

##Outputting non Z-score Average over genes

408

##Outputting non Z-score Average over genes

410

newoutput <-NuRDATN

409

newoutput <-NuRDATN

411

colnames(newoutput) <- rownames(tabRDATID)

410

colnames(newoutput) <- rownames(tabRDATID)

412

nfnewout <- strsplit(alz,"[\\]") %>%

411

nfnewout <- strsplit(alz,"[\\|/]") %>%

413

.[[1]] %>%

412

.[[1]] %>%

414

.[length(.)] %>%

413

.[length(.)] %>%

415

gsub("\\D","",.) %>%

414

gsub("\\D","",.) %>%

416

c("GSE",.,"avg.txt") %>%

415

c("GSE",.,"avg.txt") %>%

417

paste(collapse = "")

416

paste(collapse = "")

418

noutput <- newoutput %>%

417

noutput <- newoutput %>%

419

t()%>%

418

t()%>%

420

as.data.frame(.,stringsAsFactors = FALSE)

419

as.data.frame(.,stringsAsFactors = FALSE)

421

noutput <- cbind(rownames(noutput),noutput)

420

noutput <- cbind(rownames(noutput),noutput)

422

colnames(noutput) <- c("Gene Symbol",subjnam)

421

colnames(noutput) <- c("Gene Symbol",subjnam)

423

rownames(noutput) <- NULL

422

rownames(noutput) <- NULL

424

write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)

423

write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)

425

424

426

425

427

##Scaling the Data

426

##Scaling the Data

428

scrawdat <- NuRDATN%>%

427

scrawdat <- NuRDATN%>%

429

scale()

428

scale()

430

attr(scrawdat,"scaled:center") <- NULL

429

attr(scrawdat,"scaled:center") <- NULL

431

attr(scrawdat,"scaled:scale") <- NULL

430

attr(scrawdat,"scaled:scale") <- NULL

432

colnames(scrawdat) <- rownames(tabRDATID)

431

colnames(scrawdat) <- rownames(tabRDATID)

433

432

434

#Outputting the Z-score file

433

#Outputting the Z-score file

435

nfnzsc <- strsplit(alz,"[\\]") %>%

434

nfnzsc <- strsplit(alz,"[\\|/]") %>%

436

.[[1]] %>%

435

.[[1]] %>%

437

.[length(.)] %>%

436

.[length(.)] %>%

438

gsub("\\D","",.) %>%

437

gsub("\\D","",.) %>%

439

c("GSE",.,"zscore.txt") %>%

438

c("GSE",.,"zscore.txt") %>%

440

paste(collapse = "")

439

paste(collapse = "")

441

zscraw <- scrawdat %>%

440

zscraw <- scrawdat %>%

442

t()%>%

441

t()%>%

443

as.data.frame(.,stringsAsFactors = FALSE)

442

as.data.frame(.,stringsAsFactors = FALSE)

444

zscraw <- cbind(rownames(zscraw),zscraw)

443

zscraw <- cbind(rownames(zscraw),zscraw)

445

colnames(zscraw) <- c("Gene Symbol",subjnam)

444

colnames(zscraw) <- c("Gene Symbol",subjnam)

446

rownames(zscraw) <- NULL

445

rownames(zscraw) <- NULL

447

write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)

446

write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)

448

447

449

448

450

##Discretized the Data

449

##Discretized the Data

451

dialzdat <- scrawdat %>%

450

dialzdat <- scrawdat %>%

452

dndat(.) %>%

451

dndat(.) %>%

453

t()%>%

452

t()%>%

454

as.data.frame(.,stringsAsFactors = FALSE)

453

as.data.frame(.,stringsAsFactors = FALSE)

455

colnames(dialzdat) <- rownames(RAWDATNUM)

454

colnames(dialzdat) <- rownames(RAWDATNUM)

456

455

457

##setting "ID_REF" as a new variable

456

##setting "ID_REF" as a new variable

458

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

457

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

459

colnames(geneNAM) <- "ID_REF"

458

colnames(geneNAM) <- "ID_REF"

460

rownames(dialzdat) <- NULL

459

rownames(dialzdat) <- NULL

461

dialzdat <-bind_cols(geneNAM,dialzdat)

460

dialzdat <-bind_cols(geneNAM,dialzdat)

462

461

463

##NAs in a column

462

##NAs in a column

464

x <- 2

463

x <- 2

465

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

464

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

466

nacol[1,1] = "COL_NAs"

465

nacol[1,1] = "COL_NAs"

467

for(x in 2:dim(dialzdat)[2]){

466

for(x in 2:dim(dialzdat)[2]){

468

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

467

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

469

x <- x + 1

468

x <- x + 1

470

}

469

}

471

colnames(nacol) <- colnames(dialzdat)

470

colnames(nacol) <- colnames(dialzdat)

472

dialzdat <- bind_rows(dialzdat,nacol)

471

dialzdat <- bind_rows(dialzdat,nacol)

473

472

474

##NAs in a row

473

##NAs in a row

475

y <- 1

474

y <- 1

476

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

475

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

477

for(y in 1:dim(dialzdat)[1]){

476

for(y in 1:dim(dialzdat)[1]){

478

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

477

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

479

y <- y + 1

478

y <- y + 1

480

}

479

}

481

colnames(narowd) <- "ROW_NAs"

480

colnames(narowd) <- "ROW_NAs"

482

dialzdat <- bind_cols(dialzdat,narowd)

481

dialzdat <- bind_cols(dialzdat,narowd)

483

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

482

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

484

colnames(RAWWORD) <- colnames(dialzdat)

483

colnames(RAWWORD) <- colnames(dialzdat)

485

##converting to character so that the clinical can be brought together with discrete data

484

##converting to character so that the clinical can be brought together with discrete data

486

k <- 2

485

k <- 2

487

for(k in 2:dim(dialzdat)[2]-1){

486

for(k in 2:dim(dialzdat)[2]-1){

488

dialzdat[,k] <- as.character(dialzdat[,k])

487

dialzdat[,k] <- as.character(dialzdat[,k])

489

k <- k + 1

488

k <- k + 1

490

}

489

}

491

#The End the full data

490

#The End the full data

492

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

491

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

493

492

494

#Produces Discrete file

493

#Produces Discrete file

495

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

494

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

496

.[[1]] %>%

495

.[[1]] %>%

497

.[length(.)] %>%

496

.[length(.)] %>%

498

gsub("\\D","",.) %>%

497

gsub("\\D","",.) %>%

499

c("GSE",.,"dscrt.txt") %>%

498

c("GSE",.,"dscrt.txt") %>%

500

paste(collapse = "")

499

paste(collapse = "")

501

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

500

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

502

n <- n +1

501

n <- n +1

503

}

502

}

504

} else if(numDAT == 2){

503

} else if(numDAT == 2){

505

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

504

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

506

505

507

#All the files you want to analyze

506

#All the files you want to analyze

508

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

507

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

509

if(length(ANDIS) == 0){

508

if(length(ANDIS) == 0){

510

#Spit out a warning

509

#Spit out a warning

511

warning("You did not select any files and so no cleaning will be performed")

510

warning("You did not select any files and so no cleaning will be performed")

512

} else{

511

} else{

513

#indexing the data files

512

#indexing the data files

514

n <- 1

513

n <- 1

515

for(n in 1: length(ANDIS)){

514

for(n in 1: length(ANDIS)){

516

alz <- ANDIS[n]

515

alz <- ANDIS[n]

517

516

518

#Working with the wordy part of the document

517

#Working with the wordy part of the document

519

alzword <- alz %>%

518

alzword <- alz %>%

520

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

519

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

521

filter(grepl("!Sample",X1))%>%

520

filter(grepl("!Sample",X1))%>%

522

filter(!grepl("!Sample_contact",X1))

521

filter(!grepl("!Sample_contact",X1))

523

522

524

#Getting the GPL file

523

#Getting the GPL file

525

genena <- grep("_platform_id",alzword$X1) %>%

524

genena <- grep("_platform_id",alzword$X1) %>%

526

alzword$X2[.] %>%

525

alzword$X2[.] %>%

527

str_trim(.) %>%

526

str_trim(.) %>%

528

paste0("^",.,"\\D") %>%

527

paste0("^",.,"\\D") %>%

529

grep(.,list.files()) %>%

528

grep(.,list.files()) %>%

530

list.files()[.]

529

list.files()[.]

531

530

532

#Find out if it is a soft GPL file or not

531

#Find out if it is a soft GPL file or not

533

soft <- strsplit(genena,"[\\|/]") %>%

532

soft <- strsplit(genena,"[\\|/]") %>%

534

.[[1]] %>%

533

.[[1]] %>%

535

.[length(.)] %>%

534

.[length(.)] %>%

536

grepl("soft",.)

535

grepl("soft",.)

537

536

538

##Changing row names and column names:

537

##Changing row names and column names:

539

ALZWORD <- t(alzword)

538

ALZWORD <- t(alzword)

540

rownames(ALZWORD)=NULL

539

rownames(ALZWORD)=NULL

541

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

540

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

542

ALZWORD <- chngrownm(ALZWORD)[-1,]

541

ALZWORD <- chngrownm(ALZWORD)[-1,]

543

ALZWORD <- ALZWORD%>%

542

ALZWORD <- ALZWORD%>%

544

as.data.frame(.,stringsAsFactors = FALSE)%>%

543

as.data.frame(.,stringsAsFactors = FALSE)%>%

545

dplyr::select(-starts_with("col"))

544

dplyr::select(-starts_with("col"))

546

545

547

##Reorganizing information within the columns and final clinical data

546

##Reorganizing information within the columns and final clinical data

548

ALZWORDF <- cinfo(ALZWORD)

547

ALZWORDF <- cinfo(ALZWORD)

549

548

550

549

551

#Working with Actual Data part of file

550

#Working with Actual Data part of file

552

alzdat <- alz %>%

551

alzdat <- alz %>%

553

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

552

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

554

ALZDAT <- t(alzdat[,-1])

553

ALZDAT <- t(alzdat[,-1])

555

rownames(ALZDAT)=NULL

554

rownames(ALZDAT)=NULL

556

555

557

##Is there a clean version of the GPL file available?

556

##Is there a clean version of the GPL file available?

558

gplnum <- strsplit(genena,"[\\|/]") %>%

557

gplnum <- strsplit(genena,"[\\|/]") %>%

559

.[[1]] %>%

558

.[[1]] %>%

560

.[length(.)] %>%

559

.[length(.)] %>%

561

gsub("\\D","",.)

560

gsub("\\D","",.)

562

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

561

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

563

if(clfileex >= 1){

562

if(clfileex >= 1){

564

#use the clean version

563

#use the clean version

565

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

564

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

566

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

565

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

567

566

568

} else if(clfileex == 0){

567

} else if(clfileex == 0){

569

##Let's create a clean version

568

##Let's create a clean version

570

569

571

##Gene ID to Gene Name

570

##Gene ID to Gene Name

572

if(soft == TRUE){

571

if(soft == TRUE){

573

#Check to see if there is already a file containing information on soft files

572

#Check to see if there is already a file containing information on soft files

574

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

573

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

575

if(fileex == 1){

574

if(fileex == 1){

576

#Check to see if this GPL soft file has been used before

575

#Check to see if this GPL soft file has been used before

577

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

576

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

578

.$GPL_FILE_NUM%>%

577

.$GPL_FILE_NUM%>%

579

grepl(gplnum,.) %>%

578

grepl(gplnum,.) %>%

580

sum()

579

sum()

581

if(IDF == 1){

580

if(IDF == 1){

582

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

581

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

583

.$GPL_FILE_NUM%>%

582

.$GPL_FILE_NUM%>%

584

grep(gplnum,.)

583

grep(gplnum,.)

585

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

584

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

586

.$LOC_ID %>%

585

.$LOC_ID %>%

587

.[IDLOCAL]

586

.[IDLOCAL]

588

geneIDNam <- genena %>%

587

geneIDNam <- genena %>%

589

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

588

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

590

589

591

} else if(IDF == 0){

590

} else if(IDF == 0){

592

#No information on this particular GPL file

591

#No information on this particular GPL file

593

idLOCGPL <- genena %>%

592

idLOCGPL <- genena %>%

594

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

593

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

595

t(.) %>%

594

t(.) %>%

596

grep("^ID\\s*$",.) %>%

595

grep("^ID\\s*$",.) %>%

597

-1

596

-1

598

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

597

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

599

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

598

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

600

geneIDNam <- genena %>%

599

geneIDNam <- genena %>%

601

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

600

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

602

601

603

}

602

}

604

} else if(fileex == 0){

603

} else if(fileex == 0){

605

#We must create a file that we can access for later use

604

#We must create a file that we can access for later use

606

idLOCGPL <- genena %>%

605

idLOCGPL <- genena %>%

607

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

606

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

608

t(.) %>%

607

t(.) %>%

609

grep("^ID\\s*$",.) %>%

608

grep("^ID\\s*$",.) %>%

610

-1

609

-1

611

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

610

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

612

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

611

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

613

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

612

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

614

geneIDNam <- genena %>%

613

geneIDNam <- genena %>%

615

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

614

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

616

615

617

}

616

}

618

} else if(soft == FALSE){

617

} else if(soft == FALSE){

619

geneIDNam <- genena %>%

618

geneIDNam <- genena %>%

620

read_delim(delim="\t",comment = "#")%>%

619

read_delim(delim="\t",comment = "#")%>%

621

620

622

}

621

}

623

622

624

##Labeling the gene IDs without names

623

##Labeling the gene IDs without names

625

geneIDNam <- NAFIXING(geneIDNam)

624

geneIDNam <- NAFIXING(geneIDNam)

626

625

627

##remove the whitespace

626

##remove the whitespace

628

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

627

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

629

628

630

##Here is the clean version

629

##Here is the clean version

631

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

630

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

632

}

631

}

633

632

634

633

635

634

636

##Changing the gene ID to gene name

635

##Changing the gene ID to gene name

637

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

636

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

638

colnames(ALZDAT) = ALZDAT1[1,]

637

colnames(ALZDAT) = ALZDAT1[1,]

639

638

640

639

641

##Adjusting the column names aka the gene names

640

##Adjusting the column names aka the gene names

642

colnames(ALZDAT) <- gcnames(ALZDAT)

641

colnames(ALZDAT) <- gcnames(ALZDAT)

643

642

644

643

645

#Full RAW Data

644

#Full RAW Data

646

Fullalzdwr <- ALZDAT %>%

645

Fullalzdwr <- ALZDAT %>%

647

as.data.frame(.,stringsAsFactors = FALSE) %>%

646

as.data.frame(.,stringsAsFactors = FALSE) %>%

648

cbind(ALZWORDF,.)

647

cbind(ALZWORDF,.)

649

648

650

#Raw file is output

649

#Raw file is output

651

nfnaex <- strsplit(alz,"[\\]") %>%

650

nfnaex <- strsplit(alz,"[\\|/]") %>%

652

.[[1]] %>%

651

.[[1]] %>%

653

.[length(.)] %>%

652

.[length(.)] %>%

654

gsub("\\D","",.) %>%

653

gsub("\\D","",.) %>%

655

c("GSE",.,"aftexcel.txt") %>%

654

c("GSE",.,"aftexcel.txt") %>%

656

paste(collapse = "")

655

paste(collapse = "")

657

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

656

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

658

657

659

658

660

659

661

#Now for the discretization part

660

#Now for the discretization part

662

##get the wordy part again

661

##get the wordy part again

663

rawword <- t(ALZWORDF)

662

rawword <- t(ALZWORDF)

664

663

665

##where is ID_REF located

664

##where is ID_REF located

666

hereim <- grep("ID_REF",rownames(rawword))

665

hereim <- grep("ID_REF",rownames(rawword))

667

666

668

##Subject Names GSM...

667

##Subject Names GSM...

669

subjnam <- rawword[hereim,]

668

subjnam <- rawword[hereim,]

670

669

671

##Getting the names for the rows

670

##Getting the names for the rows

672

namedarows <- rownames(rawword)[-hereim] %>%

671

namedarows <- rownames(rawword)[-hereim] %>%

673

as.data.frame(.,stringsAsFactors = FALSE)

672

as.data.frame(.,stringsAsFactors = FALSE)

674

RAWWORD <- rawword[-hereim,] %>%

673

RAWWORD <- rawword[-hereim,] %>%

675

as.data.frame(.,stringsAsFactors = FALSE) %>%

674

as.data.frame(.,stringsAsFactors = FALSE) %>%

676

bind_cols(namedarows,.)

675

bind_cols(namedarows,.)

677

z <- 1

676

z <- 1

678

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

677

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

679

for(z in 1:dim(RAWWORD)[1]){

678

for(z in 1:dim(RAWWORD)[1]){

680

if(sum(is.na(RAWWORD[z,])) > 0){

679

if(sum(is.na(RAWWORD[z,])) > 0){

681

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

680

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

682

}

681

}

683

if(length(grep("NA",RAWWORD[z,])) > 0){

682

if(length(grep("NA",RAWWORD[z,])) > 0){

684

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

683

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

685

}

684

}

686

z <- z + 1

685

z <- z + 1

687

}

686

}

688

687

689

colnames(naroww) <- "ROW_NAs"

688

colnames(naroww) <- "ROW_NAs"

690

RAWWORD <- bind_cols(RAWWORD,naroww)

689

RAWWORD <- bind_cols(RAWWORD,naroww)

691

690

692

691

693

roALZna <- t(ALZDAT) %>%

692

roALZna <- t(ALZDAT) %>%

694

rownames(.) %>%

693

rownames(.) %>%

695

as.data.frame(.,stringsAsFactors = FALSE)

694

as.data.frame(.,stringsAsFactors = FALSE)

696

colnames(roALZna) <- "ID_REF"

695

colnames(roALZna) <- "ID_REF"

697

696

698

RAWDAT <- t(ALZDAT) %>%

697

RAWDAT <- t(ALZDAT) %>%

699

as.data.frame(.,stringsAsFactors = FALSE)

698

as.data.frame(.,stringsAsFactors = FALSE)

700

colnames(RAWDAT) <- NULL

699

colnames(RAWDAT) <- NULL

701

rownames(RAWDAT) <- NULL

700

rownames(RAWDAT) <- NULL

702

701

703

RAWDAT2 <- RAWDAT %>%

702

RAWDAT2 <- RAWDAT %>%

704

cbind(roALZna,.) %>%

703

cbind(roALZna,.) %>%

705

dplyr::arrange(.,ID_REF)

704

dplyr::arrange(.,ID_REF)

706

705

707

##Editing the file for R processing

706

##Editing the file for R processing

708

RAWDATID <- RAWDAT2[,1] %>%

707

RAWDATID <- RAWDAT2[,1] %>%

709

as.matrix(.)

708

as.matrix(.)

710

709

711

RAWDATNUM <- RAWDAT2[,-1] %>%

710

RAWDATNUM <- RAWDAT2[,-1] %>%

712

mapply(.,FUN = as.numeric) %>%

711

mapply(.,FUN = as.numeric) %>%

713

t(.)

712

t(.)

714

713

715

##Consolidating genes with the same name

714

##Consolidating genes with the same name

716

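###create a zero-filled matrix with one column per distinct ID in tabRDATID and one row per row of RAWDATNUM

715

###create a zero-filled matrix with one column per distinct ID in tabRDATID and one row per row of RAWDATNUM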

717

tabRDATID <- table(RAWDATID)

716

tabRDATID <- table(RAWDATID)

718

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

717

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

719

j <- 1

718

j <- 1

720

for(j in 1:length(tabRDATID)){

719

for(j in 1:length(tabRDATID)){

721

##Putting the ones without duplicates in their new homes

720

##Putting the ones without duplicates in their new homes

722

if(tabRDATID[j] == 1){

721

if(tabRDATID[j] == 1){

723

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

722

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

724

} else if(tabRDATID[j] > 1){

723

} else if(tabRDATID[j] > 1){

725

##Averaging duplicates and putting them in their new homes

724

##Averaging duplicates and putting them in their new homes

726

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

725

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

727

}

726

}

728

j <- j + 1

727

j <- j + 1

729

}

728

}

730

729

731

##Outputting non Z-score Average over genes

730

##Outputting non Z-score Average over genes

732

newoutput <-NuRDATN

731

newoutput <-NuRDATN

733

colnames(newoutput) <- rownames(tabRDATID)

732

colnames(newoutput) <- rownames(tabRDATID)

734

nfnewout <- strsplit(alz,"[\\]") %>%

733

nfnewout <- strsplit(alz,"[\\|/]") %>%

735

.[[1]] %>%

734

.[[1]] %>%

736

.[length(.)] %>%

735

.[length(.)] %>%

737

gsub("\\D","",.) %>%

736

gsub("\\D","",.) %>%

738

c("GSE",.,"avg.txt") %>%

737

c("GSE",.,"avg.txt") %>%

739

paste(collapse = "")

738

paste(collapse = "")

740

noutput <- newoutput %>%

739

noutput <- newoutput %>%

741

t()%>%

740

t()%>%

742

as.data.frame(.,stringsAsFactors = FALSE)

741

as.data.frame(.,stringsAsFactors = FALSE)

743

noutput <- cbind(rownames(noutput),noutput)

742

noutput <- cbind(rownames(noutput),noutput)

744

colnames(noutput) <- c("Gene Symbol",subjnam)

743

colnames(noutput) <- c("Gene Symbol",subjnam)

745

rownames(noutput) <- NULL

744

rownames(noutput) <- NULL

746

write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)

745

write.table(noutput, file = nfnewout, sep = "\t",col.names = TRUE,row.names = FALSE)

747

746

748

747

749

##Scaling the Data

748

##Scaling the Data

750

scrawdat <- NuRDATN%>%

749

scrawdat <- NuRDATN%>%

751

scale()

750

scale()

752

attr(scrawdat,"scaled:center") <- NULL

751

attr(scrawdat,"scaled:center") <- NULL

753

attr(scrawdat,"scaled:scale") <- NULL

752

attr(scrawdat,"scaled:scale") <- NULL

754

colnames(scrawdat) <- rownames(tabRDATID)

753

colnames(scrawdat) <- rownames(tabRDATID)

755

754

756

#Outputting the Z-score file

755

#Outputting the Z-score file

757

nfnzsc <- strsplit(alz,"[\\]") %>%

756

nfnzsc <- strsplit(alz,"[\\|/]") %>%

758

.[[1]] %>%

757

.[[1]] %>%

759

.[length(.)] %>%

758

.[length(.)] %>%

760

gsub("\\D","",.) %>%

759

gsub("\\D","",.) %>%

761

c("GSE",.,"zscore.txt") %>%

760

c("GSE",.,"zscore.txt") %>%

762

paste(collapse = "")

761

paste(collapse = "")

763

zscraw <- scrawdat %>%

762

zscraw <- scrawdat %>%

764

t()%>%

763

t()%>%

765

as.data.frame(.,stringsAsFactors = FALSE)

764

as.data.frame(.,stringsAsFactors = FALSE)

766

zscraw <- cbind(rownames(zscraw),zscraw)

765

zscraw <- cbind(rownames(zscraw),zscraw)

767

colnames(zscraw) <- c("Gene Symbol",subjnam)

766

colnames(zscraw) <- c("Gene Symbol",subjnam)

768

rownames(zscraw) <- NULL

767

rownames(zscraw) <- NULL

769

write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)

768

write.table(zscraw, file = nfnzsc, sep = "\t",col.names = TRUE,row.names = FALSE)

770

769

771

##Discretizing the Data

770

##Discretizing the Data

772

dialzdat <- scrawdat %>%

771

dialzdat <- scrawdat %>%

773

dndat(.) %>%

772

dndat(.) %>%

774

t()%>%

773

t()%>%

775

as.data.frame(.,stringsAsFactors = FALSE)

774

as.data.frame(.,stringsAsFactors = FALSE)

776

colnames(dialzdat) <- rownames(RAWDATNUM)

775

colnames(dialzdat) <- rownames(RAWDATNUM)

777

776

778

##setting "ID_REF" as a new variable

777

##setting "ID_REF" as a new variable

779

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

778

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1),stringsAsFactors = FALSE)

780

colnames(geneNAM) <- "ID_REF"

779

colnames(geneNAM) <- "ID_REF"

781

rownames(dialzdat) <- NULL

780

rownames(dialzdat) <- NULL

782

dialzdat <-bind_cols(geneNAM,dialzdat)

781

dialzdat <-bind_cols(geneNAM,dialzdat)

783

782

784

##NAs in a column

783

##NAs in a column

785

x <- 2

784

x <- 2

786

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

785

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

787

nacol[1,1] = "COL_NAs"

786

nacol[1,1] = "COL_NAs"

788

for(x in 2:dim(dialzdat)[2]){

787

for(x in 2:dim(dialzdat)[2]){

789

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

788

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

790

x <- x + 1

789

x <- x + 1

791

}

790

}

792

colnames(nacol) <- colnames(dialzdat)

791

colnames(nacol) <- colnames(dialzdat)

793

dialzdat <- bind_rows(dialzdat,nacol)

792

dialzdat <- bind_rows(dialzdat,nacol)

794

793

795

##NAs in a row

794

##NAs in a row

796

y <- 1

795

y <- 1

797

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

796

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

798

for(y in 1:dim(dialzdat)[1]){

797

for(y in 1:dim(dialzdat)[1]){

799

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

798

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

800

y <- y + 1

799

y <- y + 1

801

}

800

}

802

colnames(narowd) <- "ROW_NAs"

801

colnames(narowd) <- "ROW_NAs"

803

dialzdat <- bind_cols(dialzdat,narowd)

802

dialzdat <- bind_cols(dialzdat,narowd)

804

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

803

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

805

colnames(RAWWORD) <- colnames(dialzdat)

804

colnames(RAWWORD) <- colnames(dialzdat)

806

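##converting the columns of dialzdat to character so that the clinical data can be combined with the discrete data
##NOTE(review): `2:dim(dialzdat)[2]-1` parses as `(2:n)-1`, i.e. 1:(n-1); `2:(dim(dialzdat)[2]-1)` was probably intended - confirm

805

##converting the columns of dialzdat to character so that the clinical data can be combined with the discrete data
##NOTE(review): `2:dim(dialzdat)[2]-1` parses as `(2:n)-1`, i.e. 1:(n-1); `2:(dim(dialzdat)[2]-1)` was probably intended - confirm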

807

k <- 2

806

k <- 2

808

for(k in 2:dim(dialzdat)[2]-1){

807

for(k in 2:dim(dialzdat)[2]-1){

809

dialzdat[,k] <- as.character(dialzdat[,k])

808

dialzdat[,k] <- as.character(dialzdat[,k])

810

k <- k + 1

809

k <- k + 1

811

}

810

}

812

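#The end: the full data (clinical information combined with the discrete data)

811

#The end: the full data (clinical information combined with the discrete data)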

813

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

812

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

814

813

815

#Produces Discrete file

814

#Produces Discrete file

816

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

815

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

817

.[[1]] %>%

816

.[[1]] %>%

818

.[length(.)] %>%

817

.[length(.)] %>%

819

gsub("\\D","",.) %>%

818

gsub("\\D","",.) %>%

820

c("GSE",.,"dscrt.txt") %>%

819

c("GSE",.,"dscrt.txt") %>%

821

paste(collapse = "")

820

paste(collapse = "")

822

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

821

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

823

822

824

823

825

n <- n + 1

824

n <- n + 1

826

}

825

}

827

}

826

}

828

}

827

}

829

}

828

}

830

#The rest of this code will be used every time you want to change a data set

829

#The rest of this code will be used every time you want to change a data set

831

#Top-level call that starts the routine
#NOTE(review): THEFT is presumably the function whose body ends just above this point - confirm
THEFT()

830

#Top-level call that starts the routine
#NOTE(review): THEFT is presumably the function whose body ends just above this point - confirm
THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Update