# Efrain Gonzalez / Cleaning and Fixing Data with R
#Efrain H. Gonzalez
#6/22/2017

# Print numeric values with up to 11 significant digits.
options(digits = 11)

#Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


#Necessary Functions

17

#1#Function for handling the changing of row names and column names
#
# Renames the columns of `mat` using the contents of its first two rows.
#
# Arguments:
#   mat  A table with existing column names and at least two rows. Row 1 is
#        compared against the "!Sample_..." field names below; row 2 is
#        searched for keywords.
#
# Returns:
#   `mat` with the same contents and updated column names. Columns whose
#   first two rows match nothing keep their current name.
chngrownm <- function(mat) {
  # Row-1 field names that map directly onto one fixed column name.
  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Keyword patterns searched in row 2. They are tried in this order and are
  # NOT mutually exclusive: when several match, the last one decides the name,
  # but every matching prefix still advances its own counter.
  keyword_rules <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi|interval",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  # Next numeric suffix for each prefix (Sex1, Sex2, ...).
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  # seq_len() yields an empty loop for a zero-column input, unlike 1:0.
  for (e in seq_len(ncol(mat))) {
    field <- as.character(mat[1, e])
    # A missing field name is treated as "no fixed name" rather than
    # crashing the condition.
    if (!is.na(field) && field %in% names(fixed_names)) {
      colnames(mat)[e] <- fixed_names[[field]]
    } else {
      value <- mat[2, e]
      for (prefix in names(keyword_rules)) {
        if (grepl(keyword_rules[[prefix]], value)) {
          colnames(mat)[e] <- paste0(prefix, counters[[prefix]])
          counters[[prefix]] <- counters[[prefix]] + 1
        }
      }
    }
  }
  mat
}

61

59

62

#2#Function for reorganizing information within the columns
#
# Cleans the values of every column after the first, based on which keyword
# the column name contains.
#
# Arguments:
#   mat  A data frame whose column names may contain "Group", "Age", "Sex",
#        "PMI" or "Braak".
#
# Returns:
#   `mat` with the matching columns rewritten:
#     Group  text up to and including the last ": " removed, as well as a
#            trailing " ...;..." part
#     Age    every non-digit removed, then converted to integer
#     Sex    text up to and including the last ": " removed
#     PMI    everything except digits and "." removed, then numeric
#     Braak  label removed, roman numeral converted to integer
#   The first column and columns matching no keyword are left untouched.
cinfo <- function(mat) {
  # Checked in this order; the first keyword found in the name wins.
  kinds <- c("Group", "Age", "Sex", "PMI", "Braak")

  # Start at column 2. seq_len(n)[-1] is empty for a one-column table,
  # whereas 2:1 would count down and touch column 1.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- colnames(mat)[j]
    matched <- kinds[vapply(kinds, grepl, logical(1), x = col_name)]
    if (length(matched) == 0) {
      next
    }
    raw <- mat[, j]
    mat[, j] <- switch(matched[1],
      Group = gsub(".+:\\s|\\s.+;.+", "", raw),
      # NOTE(review): all non-digits are dropped, so a decimal age such as
      # "85.5" becomes 855 -- confirm ages are always whole numbers.
      Age = as.integer(gsub("\\D", "", raw)),
      Sex = gsub(".+:\\s", "", raw),
      PMI = as.numeric(gsub("[^0-9\\.]", "", raw)),
      Braak = as.integer(as.roman(gsub(".+:\\s", "", raw)))
    )
  }
  mat
}

86

84

87

#3#Function for labeling the gene IDs without names
#
# Fills in missing entries of the second column with the value found in the
# first column of the same row.
#
# Arguments:
#   GIDNAM  A two-column table: column 1 holds the IDs, column 2 the names.
#
# Returns:
#   `GIDNAM` where every column-2 entry that is NA, or is the text "NA"
#   (optionally followed by whitespace), has been replaced by the column-1
#   value of that row. All other entries are unchanged.
NAFIXING <- function(GIDNAM) {
  symbols <- GIDNAM[, 2, drop = TRUE]
  # One vectorised pass instead of a row-by-row loop; which() also makes a
  # zero-row table a no-op.
  unnamed <- which(is.na(symbols) | grepl("^NA\\s*$", symbols))
  if (length(unnamed) > 0) {
    GIDNAM[unnamed, 2] <- GIDNAM[unnamed, 1, drop = TRUE]
  }
  GIDNAM
}

99

97

100

#4#Function for changing the gene ID to gene name
#
# Looks up the ID in the first column of `DATA` in the ID column of
# `GeneName` and swaps it for the corresponding name.
#
# Arguments:
#   GeneName  Two-column table: column 1 = ID, column 2 = name.
#   DATA      Table whose first column holds the IDs to translate.
#
# Returns:
#   The TRANSPOSE of `DATA` (as produced by t()), with row 1 holding the
#   translated names. IDs that are not found in `GeneName` are left as they
#   are. When an ID occurs several times in `GeneName`, the first occurrence
#   is used.
cgeneID <- function(GeneName, DATA) {
  gene_tab <- t(GeneName) # row 1 = IDs, row 2 = names
  out <- t(DATA)          # row 1 = ID of every record in DATA

  ids <- out[1, ]
  # Exact lookup. IDs are compared literally, so characters such as "." or
  # "+" in an ID are not interpreted as regular-expression syntax.
  hit <- match(ids, gene_tab[1, ])
  found <- which(!is.na(hit) & !is.na(ids))
  if (length(found) > 0) {
    out[1, found] <- gene_tab[2, hit[found]]
  }
  out
}

119

#cgeneID <- function(GeneName,DATA){

117

#cgeneID <- function(GeneName,DATA){

120

# colGene <- dim(GeneName)[2]

118

# colGene <- dim(GeneName)[2]

121

# j <- 1

119

# j <- 1

122

# for(j in 1:colGene){

120

# for(j in 1:colGene){

123

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

121

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

124

# if(is.na(sum(chngsreq))==FALSE){

122

# if(is.na(sum(chngsreq))==FALSE){

125

# if(sum(chngsreq) > 0){

123

# if(sum(chngsreq) > 0){

126

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

124

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

127

# }

125

# }

128

# }

126

# }

129

# j = j+1

127

# j = j+1

130

# }

128

# }

131

# DATA

129

# DATA

132

#}

130

#}

133

131

134

#5#Function for adjusting the gene names
#
# Shortens column names that list several alternatives separated by "///"
# or "//" to a single alternative.
#
# Arguments:
#   DiData  A table with column names.
#   usecol  Which alternative to keep (1 = first). When a name has fewer
#           than `usecol` alternatives, its first alternative is used.
#
# Returns:
#   A character vector with one trimmed name per column of `DiData`.
gcnames <- function(DiData, usecol = 1) {
  nuruns <- dim(DiData)[2]
  # Nothing to rename for a table without columns.
  if (nuruns == 0) {
    return(character(0))
  }

  alternatives <- strsplit(colnames(DiData), "///|//")
  chosen <- vapply(
    alternatives,
    function(parts) {
      if (length(parts) >= usecol) parts[usecol] else parts[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
  # Drop the spaces that surrounded the separator.
  trimws(chosen)
}

150

148

151

#6# Function for discretizing the data
#
# Maps every value of `NDATA` onto one of three levels.
#
# Arguments:
#   NDATA  A numeric matrix (or a table convertible with as.matrix()).
#
# Returns:
#   A numeric matrix of the same dimensions and column names holding
#     0   where the value is below -1
#     1   where the value lies in [-1, 1]
#     2   where the value is above 1
#   Missing values are carried over unchanged.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start with the middle level everywhere. This makes both boundaries
  # (-1 and 1) belong to the middle band; a value of exactly 1 previously
  # matched no rule and was left at 0.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are not touched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  # Copy missing values through as they are.
  missing <- which(is.na(values))
  DDATA[missing] <- values[missing]

  DDATA
}

179

177

180

178

181

#MajorFunction#This is the function that does everything else

179

#MajorFunction#This is the function that does everything else

182

THEFT <- function(){

180

THEFT <- function(){

183

#Set working directory based on the directory of the series matrix file Currently only works for windows

181

#Set working directory based on the directory of the series matrix file Currently only works for windows

184

wd <- getwd()

182

wd <- getwd()

185

#list.files()

183

#list.files()

186

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

184

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

187

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

185

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

188

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

186

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

189

GSEfloc <- list.files()[GSEfileloc]

187

GSEfloc <- list.files()[GSEfileloc]

190

#ALL DATA FILES WILL BE CLEANED

188

#ALL DATA FILES WILL BE CLEANED

191

if(numDAT == 1){

189

if(numDAT == 1){

192

#indexing the data files

190

#indexing the data files

193

n <- 1

191

n <- 1

194

for(n in 1: length(GSEfloc)){

192

for(n in 1: length(GSEfloc)){

195

alz <- GSEfloc[n]

193

alz <- GSEfloc[n]

196

194

197

#Working with the wordy part of the document

195

#Working with the wordy part of the document

198

alzword <- alz %>%

196

alzword <- alz %>%

199

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

197

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

200

filter(grepl("!Sample",X1))%>%

198

filter(grepl("!Sample",X1))%>%

201

filter(!grepl("!Sample_contact",X1))

199

filter(!grepl("!Sample_contact",X1))

202

200

203

#Getting the GPL file

201

#Getting the GPL file

204

genena <- grep("_platform_id",alzword$X1) %>%

202

genena <- grep("_platform_id",alzword$X1) %>%

205

alzword$X2[.] %>%

203

alzword$X2[.] %>%

206

str_trim(.) %>%

204

str_trim(.) %>%

207

paste0("^",.,"\\D") %>%

205

paste0("^",.,"\\D") %>%

208

grep(.,list.files()) %>%

206

grep(.,list.files()) %>%

209

list.files()[.]

207

list.files()[.]

210

208

211

#Find out if it is a soft GPL file or not

209

#Find out if it is a soft GPL file or not

212

soft <- strsplit(genena,"[\\|/]") %>%

210

soft <- strsplit(genena,"[\\|/]") %>%

213

.[[1]] %>%

211

.[[1]] %>%

214

.[length(.)] %>%

212

.[length(.)] %>%

215

grepl("soft",.)

213

grepl("soft",.)

216

214

217

##Changing row names and column names:

215

##Changing row names and column names:

218

ALZWORD <- t(alzword)

216

ALZWORD <- t(alzword)

219

rownames(ALZWORD)=NULL

217

rownames(ALZWORD)=NULL

220

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

218

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

221

ALZWORD <- chngrownm(ALZWORD)[-1,]

219

ALZWORD <- chngrownm(ALZWORD)[-1,]

222

ALZWORD <- ALZWORD%>%

220

ALZWORD <- ALZWORD%>%

223

as.data.frame()%>%

221

as.data.frame()%>%

224

dplyr::select(-starts_with("col"))

222

dplyr::select(-starts_with("col"))

225

223

226

##Reorganizing information within the columns and final clinical data

224

##Reorganizing information within the columns and final clinical data

227

ALZWORDF <- cinfo(ALZWORD)

225

ALZWORDF <- cinfo(ALZWORD)

228

226

229

227

230

#Working with Actual Data part of file

228

#Working with Actual Data part of file

231

alzdat <- alz %>%

229

alzdat <- alz %>%

232

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

230

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

233

ALZDAT <- t(alzdat[,-1])

231

ALZDAT <- t(alzdat[,-1])

234

rownames(ALZDAT)=NULL

232

rownames(ALZDAT)=NULL

235

233

236

##Is there a clean version of the GPL file available?

234

##Is there a clean version of the GPL file available?

237

gplnum <- strsplit(genena,"[\\|/]") %>%

235

gplnum <- strsplit(genena,"[\\|/]") %>%

238

.[[1]] %>%

236

.[[1]] %>%

239

.[length(.)] %>%

237

.[length(.)] %>%

240

gsub("\\D","",.)

238

gsub("\\D","",.)

241

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

239

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

242

if(clfileex >= 1){

240

if(clfileex >= 1){

243

#use the clean version

241

#use the clean version

244

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

242

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

245

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

243

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

246

244

247

} else if(clfileex == 0){

245

} else if(clfileex == 0){

248

##Lets Create a clean version

246

##Lets Create a clean version

249

247

250

##Gene ID to Gene Name

248

##Gene ID to Gene Name

251

if(soft == TRUE){

249

if(soft == TRUE){

252

#Check to see if there is already a file containing information on soft files

250

#Check to see if there is already a file containing information on soft files

253

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

251

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

254

if(fileex == 1){

252

if(fileex == 1){

255

#Check to see if this GPL soft file has been used before

253

#Check to see if this GPL soft file has been used before

256

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

254

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

257

.$GPL_FILE_NUM%>%

255

.$GPL_FILE_NUM%>%

258

grepl(gplnum,.) %>%

256

grepl(gplnum,.) %>%

259

sum()

257

sum()

260

if(IDF == 1){

258

if(IDF == 1){

261

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

259

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

262

.$GPL_FILE_NUM%>%

260

.$GPL_FILE_NUM%>%

263

grep(gplnum,.)

261

grep(gplnum,.)

264

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

262

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

265

.$LOC_ID %>%

263

.$LOC_ID %>%

266

.[IDLOCAL]

264

.[IDLOCAL]

267

geneIDNam <- genena %>%

265

geneIDNam <- genena %>%

268

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

266

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

269

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

267

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

270

} else if(IDF == 0){

268

} else if(IDF == 0){

271

#No information on this particular GPL file

269

#No information on this particular GPL file

272

idLOCGPL <- genena %>%

270

idLOCGPL <- genena %>%

273

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

271

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

274

t(.) %>%

272

t(.) %>%

275

grep("^ID\\s*$",.) %>%

273

grep("^ID\\s*$",.) %>%

276

-1

274

-1

277

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

275

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

278

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

276

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

279

geneIDNam <- genena %>%

277

geneIDNam <- genena %>%

280

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

278

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

281

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

279

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

282

}

280

}

283

} else if(fileex == 0){

281

} else if(fileex == 0){

284

#We must create a file that we can access for later use

282

#We must create a file that we can access for later use

285

idLOCGPL <- genena %>%

283

idLOCGPL <- genena %>%

286

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

284

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

287

t(.) %>%

285

t(.) %>%

288

grep("^ID\\s*$",.) %>%

286

grep("^ID\\s*$",.) %>%

289

-1

287

-1

290

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

288

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

291

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

289

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

292

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

290

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

293

geneIDNam <- genena %>%

291

geneIDNam <- genena %>%

294

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

292

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

295

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

293

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

296

}

294

}

297

} else if(soft == FALSE){

295

} else if(soft == FALSE){

298

geneIDNam <- genena %>%

296

geneIDNam <- genena %>%

299

read_delim(delim="\t",comment = "#")%>%

297

read_delim(delim="\t",comment = "#")%>%

300

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

298

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

301

}

299

}

302

300

303

##Labeling the gene IDs without names

301

##Labeling the gene IDs without names

304

geneIDNam <- NAFIXING(geneIDNam)

302

geneIDNam <- NAFIXING(geneIDNam)

305

303

306

##remove the whitespace

304

##remove the whitespace

307

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

305

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

308

306

309

##Here is the clean version

307

##Here is the clean version

310

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

308

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

311

}

309

}

312

310

313

311

314

312

315

##Changing the gene ID to gene name

313

##Changing the gene ID to gene name

316

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

314

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

317

colnames(ALZDAT) = ALZDAT1[1,]

315

colnames(ALZDAT) = ALZDAT1[1,]

318

316

319

317

320

##Adjusting the column names aka the gene names

318

##Adjusting the column names aka the gene names

321

colnames(ALZDAT) <- gcnames(ALZDAT)

319

colnames(ALZDAT) <- gcnames(ALZDAT)

322

320

323

321

324

#Full RAW Data

322

#Full RAW Data

325

Fullalzdwr <- ALZDAT %>%

323

Fullalzdwr <- ALZDAT %>%

326

as.data.frame() %>%

324

as.data.frame() %>%

327

cbind(ALZWORDF,.)

325

cbind(ALZWORDF,.)

328

326

329

#Raw file is output

327

#Raw file is output

330

nfnaex <- strsplit(alz,"[\\]") %>%

328

nfnaex <- strsplit(alz,"[\\]") %>%

331

.[[1]] %>%

329

.[[1]] %>%

332

.[length(.)] %>%

330

.[length(.)] %>%

333

gsub("\\D","",.) %>%

331

gsub("\\D","",.) %>%

334

c("GSE",.,"aftexcel.txt") %>%

332

c("GSE",.,"aftexcel.txt") %>%

335

paste(collapse = "")

333

paste(collapse = "")

336

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

334

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

337

335

338

336

339

337

340

#Now for the discretization part

338

#Now for the discretization part

341

##get the wordy part again

339

##get the wordy part again

342

rawword <- t(ALZWORDF)

340

rawword <- t(ALZWORDF)

343

341

344

##where is ID_REF located

342

##where is ID_REF located

345

hereim <- grep("ID_REF",rownames(rawword))

343

hereim <- grep("ID_REF",rownames(rawword))

346

344

347

##Subject Names GSM...

345

##Subject Names GSM...

348

subjnam <- rawword[hereim,]

346

subjnam <- rawword[hereim,]

349

347

350

##Getting the names for the rows

348

##Getting the names for the rows

351

namedarows <- rownames(rawword)[-hereim] %>%

349

namedarows <- rownames(rawword)[-hereim] %>%

352

as.data.frame()

350

as.data.frame()

353

RAWWORD <- rawword[-hereim,] %>%

351

RAWWORD <- rawword[-hereim,] %>%

354

as.data.frame() %>%

352

as.data.frame() %>%

355

bind_cols(namedarows,.)

353

bind_cols(namedarows,.)

356

z <- 1

354

z <- 1

357

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

355

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

358

for(z in 1:dim(RAWWORD)[1]){

356

for(z in 1:dim(RAWWORD)[1]){

359

if(sum(is.na(RAWWORD[z,])) > 0){

357

if(sum(is.na(RAWWORD[z,])) > 0){

360

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

358

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

361

}

359

}

362

if(length(grep("NA",RAWWORD[z,])) > 0){

360

if(length(grep("NA",RAWWORD[z,])) > 0){

363

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

361

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

364

}

362

}

365

z <- z + 1

363

z <- z + 1

366

}

364

}

367

365

368

colnames(naroww) <- "ROW_NAs"

366

colnames(naroww) <- "ROW_NAs"

369

RAWWORD <- bind_cols(RAWWORD,naroww)

367

RAWWORD <- bind_cols(RAWWORD,naroww)

370

368

371

369

372

roALZna <- t(ALZDAT) %>%

370

roALZna <- t(ALZDAT) %>%

373

rownames(.) %>%

371

rownames(.) %>%

374

as.data.frame(.)

372

as.data.frame(.)

375

colnames(roALZna) <- "ID_REF"

373

colnames(roALZna) <- "ID_REF"

376

374

377

RAWDAT <- t(ALZDAT) %>%

375

RAWDAT <- t(ALZDAT) %>%

378

as.data.frame(.)

376

as.data.frame(.)

379

colnames(RAWDAT) <- NULL

377

colnames(RAWDAT) <- NULL

380

rownames(RAWDAT) <- NULL

378

rownames(RAWDAT) <- NULL

381

379

382

RAWDAT2 <- RAWDAT %>%

380

RAWDAT2 <- RAWDAT %>%

383

cbind(roALZna,.) %>%

381

cbind(roALZna,.) %>%

384

dplyr::arrange(.,ID_REF)

382

dplyr::arrange(.,ID_REF)

385

383

386

##Editing the file for R processing

384

##Editing the file for R processing

387

RAWDATID <- RAWDAT2[,1] %>%

385

RAWDATID <- RAWDAT2[,1] %>%

388

as.matrix(.)

386

as.matrix(.)

389

387

390

RAWDATNUM <- RAWDAT2[,-1] %>%

388

RAWDATNUM <- RAWDAT2[,-1] %>%

391

mapply(.,FUN = as.numeric) %>%

389

mapply(.,FUN = as.numeric) %>%

392

t(.)

390

t(.)

393

391

394

##Consolidating genes with the same name

392

##Consolidating genes with the same name

395

###create empty matrix of size equal to tabRDATID

393

###create empty matrix of size equal to tabRDATID

396

tabRDATID <- table(RAWDATID)

394

tabRDATID <- table(RAWDATID)

397

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

395

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

398

j <- 1

396

j <- 1

399

for(j in 1:length(tabRDATID)){

397

for(j in 1:length(tabRDATID)){

400

##Putting the ones without duplicates in their new homes

398

##Putting the ones without duplicates in their new homes

401

if(tabRDATID[j] == 1){

399

if(tabRDATID[j] == 1){

402

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

400

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

403

} else if(tabRDATID[j] > 1){

401

} else if(tabRDATID[j] > 1){

404

##Averaging duplicates and putting them in their new homes

402

##Averaging duplicates and putting them in their new homes

405

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

403

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

406

}

404

}

407

j <- j + 1

405

j <- j + 1

408

}

406

}

409

407

410

##Scaling the Data

408

##Scaling the Data

411

scrawdat <- NuRDATN%>%

409

scrawdat <- NuRDATN%>%

412

scale()

410

scale()

413

attr(scrawdat,"scaled:center") <- NULL

411

attr(scrawdat,"scaled:center") <- NULL

414

attr(scrawdat,"scaled:scale") <- NULL

412

attr(scrawdat,"scaled:scale") <- NULL

415

colnames(scrawdat) <- rownames(tabRDATID)

413

colnames(scrawdat) <- rownames(tabRDATID)

416

414

417

##Discretized the Data

415

##Discretized the Data

418

dialzdat <- scrawdat %>%

416

dialzdat <- scrawdat %>%

419

dndat(.) %>%

417

dndat(.) %>%

420

t()%>%

418

t()%>%

421

as.data.frame(.)

419

as.data.frame(.)

422

colnames(dialzdat) <- rownames(RAWDATNUM)

420

colnames(dialzdat) <- rownames(RAWDATNUM)

423

421

424

##setting "ID_REF" as a new variable

422

##setting "ID_REF" as a new variable

425

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

423

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

426

colnames(geneNAM) <- "ID_REF"

424

colnames(geneNAM) <- "ID_REF"

427

rownames(dialzdat) <- NULL

425

rownames(dialzdat) <- NULL

428

dialzdat <-bind_cols(geneNAM,dialzdat)

426

dialzdat <-bind_cols(geneNAM,dialzdat)

429

427

430

##NAs in a column

428

##NAs in a column

431

x <- 2

429

x <- 2

432

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

430

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

433

nacol[1,1] = "COL_NAs"

431

nacol[1,1] = "COL_NAs"

434

for(x in 2:dim(dialzdat)[2]){

432

for(x in 2:dim(dialzdat)[2]){

435

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

433

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

436

x <- x + 1

434

x <- x + 1

437

}

435

}

438

colnames(nacol) <- colnames(dialzdat)

436

colnames(nacol) <- colnames(dialzdat)

439

dialzdat <- bind_rows(dialzdat,nacol)

437

dialzdat <- bind_rows(dialzdat,nacol)

440

438

441

##NAs in a row

439

##NAs in a row

442

y <- 1

440

y <- 1

443

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

441

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

444

for(y in 1:dim(dialzdat)[1]){

442

for(y in 1:dim(dialzdat)[1]){

445

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

443

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

446

y <- y + 1

444

y <- y + 1

447

}

445

}

448

colnames(narowd) <- "ROW_NAs"

446

colnames(narowd) <- "ROW_NAs"

449

dialzdat <- bind_cols(dialzdat,narowd)

447

dialzdat <- bind_cols(dialzdat,narowd)

450

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

448

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

451

colnames(RAWWORD) <- colnames(dialzdat)

449

colnames(RAWWORD) <- colnames(dialzdat)

452

##converting to character so that the clinical can be brought together with discrete data

450

##converting to character so that the clinical can be brought together with discrete data

453

k <- 2

451

k <- 2

454

for(k in 2:dim(dialzdat)[2]-1){

452

for(k in 2:dim(dialzdat)[2]-1){

455

dialzdat[,k] <- as.character(dialzdat[,k])

453

dialzdat[,k] <- as.character(dialzdat[,k])

456

k <- k + 1

454

k <- k + 1

457

}

455

}

458

#The End the full data

456

#The End the full data

459

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

457

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

460

458

461

#Produces Discrete file

459

#Produces Discrete file

462

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

460

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

463

.[[1]] %>%

461

.[[1]] %>%

464

.[length(.)] %>%

462

.[length(.)] %>%

465

gsub("\\D","",.) %>%

463

gsub("\\D","",.) %>%

466

c("GSE",.,"dscrt.txt") %>%

464

c("GSE",.,"dscrt.txt") %>%

467

paste(collapse = "")

465

paste(collapse = "")

468

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

466

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

469

n <- n +1

467

n <- n +1

470

}

468

}

471

} else if(numDAT == 2){

469

} else if(numDAT == 2){

472

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

470

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

473

471

474

#All the files you want to analyze

472

#All the files you want to analyze

475

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

473

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

476

if(length(ANDIS) == 0){

474

if(length(ANDIS) == 0){

477

#Spit out a warning

475

#Spit out a warning

478

warning("You did not select any files and so no cleaning will be performed")

476

warning("You did not select any files and so no cleaning will be performed")

479

} else{

477

} else{

480

#indexing the data files

478

#indexing the data files

481

n <- 1

479

n <- 1

482

for(n in 1: length(ANDIS)){

480

for(n in 1: length(ANDIS)){

483

alz <- ANDIS[n]

481

alz <- ANDIS[n]

484

482

485

#Working with the wordy part of the document

483

#Working with the wordy part of the document

486

alzword <- alz %>%

484

alzword <- alz %>%

487

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

485

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

488

filter(grepl("!Sample",X1))%>%

486

filter(grepl("!Sample",X1))%>%

489

filter(!grepl("!Sample_contact",X1))

487

filter(!grepl("!Sample_contact",X1))

490

488

491

#Getting the GPL file

489

#Getting the GPL file

492

genena <- grep("_platform_id",alzword$X1) %>%

490

genena <- grep("_platform_id",alzword$X1) %>%

493

alzword$X2[.] %>%

491

alzword$X2[.] %>%

494

str_trim(.) %>%

492

str_trim(.) %>%

495

paste0("^",.,"\\D") %>%

493

paste0("^",.,"\\D") %>%

496

grep(.,list.files()) %>%

494

grep(.,list.files()) %>%

497

list.files()[.]

495

list.files()[.]

498

496

499

#Find out if it is a soft GPL file or not

497

#Find out if it is a soft GPL file or not

500

soft <- strsplit(genena,"[\\|/]") %>%

498

soft <- strsplit(genena,"[\\|/]") %>%

501

.[[1]] %>%

499

.[[1]] %>%

502

.[length(.)] %>%

500

.[length(.)] %>%

503

grepl("soft",.)

501

grepl("soft",.)

504

502

505

##Changing row names and column names:

503

##Changing row names and column names:

506

ALZWORD <- t(alzword)

504

ALZWORD <- t(alzword)

507

rownames(ALZWORD)=NULL

505

rownames(ALZWORD)=NULL

508

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

506

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

509

ALZWORD <- chngrownm(ALZWORD)[-1,]

507

ALZWORD <- chngrownm(ALZWORD)[-1,]

510

ALZWORD <- ALZWORD%>%

508

ALZWORD <- ALZWORD%>%

511

as.data.frame()%>%

509

as.data.frame()%>%

512

dplyr::select(-starts_with("col"))

510

dplyr::select(-starts_with("col"))

513

511

514

##Reorganizing information within the columns and final clinical data

512

##Reorganizing information within the columns and final clinical data

515

ALZWORDF <- cinfo(ALZWORD)

513

ALZWORDF <- cinfo(ALZWORD)

516

514

517

515

518

#Working with Actual Data part of file

516

#Working with Actual Data part of file

519

alzdat <- alz %>%

517

alzdat <- alz %>%

520

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

518

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

521

ALZDAT <- t(alzdat[,-1])

519

ALZDAT <- t(alzdat[,-1])

522

rownames(ALZDAT)=NULL

520

rownames(ALZDAT)=NULL

523

521

524

##Is there a clean version of the GPL file available?

522

##Is there a clean version of the GPL file available?

525

gplnum <- strsplit(genena,"[\\|/]") %>%

523

gplnum <- strsplit(genena,"[\\|/]") %>%

526

.[[1]] %>%

524

.[[1]] %>%

527

.[length(.)] %>%

525

.[length(.)] %>%

528

gsub("\\D","",.)

526

gsub("\\D","",.)

529

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

527

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

530

if(clfileex >= 1){

528

if(clfileex >= 1){

531

#use the clean version

529

#use the clean version

532

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

530

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

533

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

531

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

534

532

535

} else if(clfileex == 0){

533

} else if(clfileex == 0){

536

##Let's create a clean version

534

##Let's create a clean version

537

535

538

##Gene ID to Gene Name

536

##Gene ID to Gene Name

539

if(soft == TRUE){

537

if(soft == TRUE){

540

#Check to see if there is already a file containing information on soft files

538

#Check to see if there is already a file containing information on soft files

541

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

539

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

542

if(fileex == 1){

540

if(fileex == 1){

543

#Check to see if this GPL soft file has been used before

541

#Check to see if this GPL soft file has been used before

544

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

542

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

545

.$GPL_FILE_NUM%>%

543

.$GPL_FILE_NUM%>%

546

grepl(gplnum,.) %>%

544

grepl(gplnum,.) %>%

547

sum()

545

sum()

548

if(IDF == 1){

546

if(IDF == 1){

549

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

547

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

550

.$GPL_FILE_NUM%>%

548

.$GPL_FILE_NUM%>%

551

grep(gplnum,.)

549

grep(gplnum,.)

552

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

550

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

553

.$LOC_ID %>%

551

.$LOC_ID %>%

554

.[IDLOCAL]

552

.[IDLOCAL]

555

geneIDNam <- genena %>%

553

geneIDNam <- genena %>%

556

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

554

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

557

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

555

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

558

} else if(IDF == 0){

556

} else if(IDF == 0){

559

#No information on this particular GPL file

557

#No information on this particular GPL file

560

idLOCGPL <- genena %>%

558

idLOCGPL <- genena %>%

561

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

559

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

562

t(.) %>%

560

t(.) %>%

563

grep("^ID\\s*$",.) %>%

561

grep("^ID\\s*$",.) %>%

564

-1

562

-1

565

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

563

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

566

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

564

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

567

geneIDNam <- genena %>%

565

geneIDNam <- genena %>%

568

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

566

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

569

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

567

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

570

}

568

}

571

} else if(fileex == 0){

569

} else if(fileex == 0){

572

#We must create a file that we can access for later use

570

#We must create a file that we can access for later use

573

idLOCGPL <- genena %>%

571

idLOCGPL <- genena %>%

574

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

572

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

575

t(.) %>%

573

t(.) %>%

576

grep("^ID\\s*$",.) %>%

574

grep("^ID\\s*$",.) %>%

577

-1

575

-1

578

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

576

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

579

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

577

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

580

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

578

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

581

geneIDNam <- genena %>%

579

geneIDNam <- genena %>%

582

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

580

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

583

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

581

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

584

}

582

}

585

} else if(soft == FALSE){

583

} else if(soft == FALSE){

586

geneIDNam <- genena %>%

584

geneIDNam <- genena %>%

587

read_delim(delim="\t",comment = "#")%>%

585

read_delim(delim="\t",comment = "#")%>%

588

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

586

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

589

}

587

}

590

588

591

##Labeling the gene IDs without names

589

##Labeling the gene IDs without names

592

geneIDNam <- NAFIXING(geneIDNam)

590

geneIDNam <- NAFIXING(geneIDNam)

593

591

594

##remove the whitespace

592

##remove the whitespace

595

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

593

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

596

594

597

##Here is the clean version

595

##Here is the clean version

598

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

596

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

599

}

597

}

600

598

601

599

602

600

603

##Changing the gene ID to gene name

601

##Changing the gene ID to gene name

604

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

602

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

605

colnames(ALZDAT) = ALZDAT1[1,]

603

colnames(ALZDAT) = ALZDAT1[1,]

606

604

607

605

608

##Adjusting the column names aka the gene names

606

##Adjusting the column names aka the gene names

609

colnames(ALZDAT) <- gcnames(ALZDAT)

607

colnames(ALZDAT) <- gcnames(ALZDAT)

610

608

611

609

612

#Full RAW Data

610

#Full RAW Data

613

Fullalzdwr <- ALZDAT %>%

611

Fullalzdwr <- ALZDAT %>%

614

as.data.frame() %>%

612

as.data.frame() %>%

615

cbind(ALZWORDF,.)

613

cbind(ALZWORDF,.)

616

614

617

#Raw file is output

615

#Raw file is output

618

nfnaex <- strsplit(alz,"[\\]") %>%

616

nfnaex <- strsplit(alz,"[\\]") %>%

619

.[[1]] %>%

617

.[[1]] %>%

620

.[length(.)] %>%

618

.[length(.)] %>%

621

gsub("\\D","",.) %>%

619

gsub("\\D","",.) %>%

622

c("GSE",.,"aftexcel.txt") %>%

620

c("GSE",.,"aftexcel.txt") %>%

623

paste(collapse = "")

621

paste(collapse = "")

624

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

622

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

625

623

626

624

627

625

628

#Now for the discretization part

626

#Now for the discretization part

629

##get the wordy part again

627

##get the wordy part again

630

rawword <- t(ALZWORDF)

628

rawword <- t(ALZWORDF)

631

629

632

##where is ID_REF located

630

##where is ID_REF located

633

hereim <- grep("ID_REF",rownames(rawword))

631

hereim <- grep("ID_REF",rownames(rawword))

634

632

635

##Subject Names GSM...

633

##Subject Names GSM...

636

subjnam <- rawword[hereim,]

634

subjnam <- rawword[hereim,]

637

635

638

##Getting the names for the rows

636

##Getting the names for the rows

639

namedarows <- rownames(rawword)[-hereim] %>%

637

namedarows <- rownames(rawword)[-hereim] %>%

640

as.data.frame()

638

as.data.frame()

641

RAWWORD <- rawword[-hereim,] %>%

639

RAWWORD <- rawword[-hereim,] %>%

642

as.data.frame() %>%

640

as.data.frame() %>%

643

bind_cols(namedarows,.)

641

bind_cols(namedarows,.)

644

z <- 1

642

z <- 1

645

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

643

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

646

for(z in 1:dim(RAWWORD)[1]){

644

for(z in 1:dim(RAWWORD)[1]){

647

if(sum(is.na(RAWWORD[z,])) > 0){

645

if(sum(is.na(RAWWORD[z,])) > 0){

648

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

646

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

649

}

647

}

650

if(length(grep("NA",RAWWORD[z,])) > 0){

648

if(length(grep("NA",RAWWORD[z,])) > 0){

651

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

649

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

652

}

650

}

653

z <- z + 1

651

z <- z + 1

654

}

652

}

655

653

656

colnames(naroww) <- "ROW_NAs"

654

colnames(naroww) <- "ROW_NAs"

657

RAWWORD <- bind_cols(RAWWORD,naroww)

655

RAWWORD <- bind_cols(RAWWORD,naroww)

658

656

659

657

660

roALZna <- t(ALZDAT) %>%

658

roALZna <- t(ALZDAT) %>%

661

rownames(.) %>%

659

rownames(.) %>%

662

as.data.frame(.)

660

as.data.frame(.)

663

colnames(roALZna) <- "ID_REF"

661

colnames(roALZna) <- "ID_REF"

664

662

665

RAWDAT <- t(ALZDAT) %>%

663

RAWDAT <- t(ALZDAT) %>%

666

as.data.frame(.)

664

as.data.frame(.)

667

colnames(RAWDAT) <- NULL

665

colnames(RAWDAT) <- NULL

668

rownames(RAWDAT) <- NULL

666

rownames(RAWDAT) <- NULL

669

667

670

RAWDAT2 <- RAWDAT %>%

668

RAWDAT2 <- RAWDAT %>%

671

cbind(roALZna,.) %>%

669

cbind(roALZna,.) %>%

672

dplyr::arrange(.,ID_REF)

670

dplyr::arrange(.,ID_REF)

673

671

674

##Editing the file for R processing

672

##Editing the file for R processing

675

RAWDATID <- RAWDAT2[,1] %>%

673

RAWDATID <- RAWDAT2[,1] %>%

676

as.matrix(.)

674

as.matrix(.)

677

675

678

RAWDATNUM <- RAWDAT2[,-1] %>%

676

RAWDATNUM <- RAWDAT2[,-1] %>%

679

mapply(.,FUN = as.numeric) %>%

677

mapply(.,FUN = as.numeric) %>%

680

t(.)

678

t(.)

681

679

682

##Consolidating genes with the same name

680

##Consolidating genes with the same name

683

###create a zero-filled matrix with one column per entry of tabRDATID

681

###create a zero-filled matrix with one column per entry of tabRDATID

684

tabRDATID <- table(RAWDATID)

682

tabRDATID <- table(RAWDATID)

685

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

683

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

686

j <- 1

684

j <- 1

687

for(j in 1:length(tabRDATID)){

685

for(j in 1:length(tabRDATID)){

688

##Putting the ones without duplicates in their new homes

686

##Putting the ones without duplicates in their new homes

689

if(tabRDATID[j] == 1){

687

if(tabRDATID[j] == 1){

690

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

688

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

691

} else if(tabRDATID[j] > 1){

689

} else if(tabRDATID[j] > 1){

692

##Averaging duplicates and putting them in their new homes

690

##Averaging duplicates and putting them in their new homes

693

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

691

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

694

}

692

}

695

j <- j + 1

693

j <- j + 1

696

}

694

}

697

695

698

##Scaling the Data

696

##Scaling the Data

699

scrawdat <- NuRDATN%>%

697

scrawdat <- NuRDATN%>%

700

scale()

698

scale()

701

attr(scrawdat,"scaled:center") <- NULL

699

attr(scrawdat,"scaled:center") <- NULL

702

attr(scrawdat,"scaled:scale") <- NULL

700

attr(scrawdat,"scaled:scale") <- NULL

703

colnames(scrawdat) <- rownames(tabRDATID)

701

colnames(scrawdat) <- rownames(tabRDATID)

704

702

705

##Discretizing the Data

703

##Discretizing the Data

706

dialzdat <- scrawdat %>%

704

dialzdat <- scrawdat %>%

707

dndat(.) %>%

705

dndat(.) %>%

708

t()%>%

706

t()%>%

709

as.data.frame(.)

707

as.data.frame(.)

710

colnames(dialzdat) <- rownames(RAWDATNUM)

708

colnames(dialzdat) <- rownames(RAWDATNUM)

711

709

712

##setting "ID_REF" as a new variable

710

##setting "ID_REF" as a new variable

713

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

711

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

714

colnames(geneNAM) <- "ID_REF"

712

colnames(geneNAM) <- "ID_REF"

715

rownames(dialzdat) <- NULL

713

rownames(dialzdat) <- NULL

716

dialzdat <-bind_cols(geneNAM,dialzdat)

714

dialzdat <-bind_cols(geneNAM,dialzdat)

717

715

718

##NAs in a column

716

##NAs in a column

719

x <- 2

717

x <- 2

720

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

718

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

721

nacol[1,1] = "COL_NAs"

719

nacol[1,1] = "COL_NAs"

722

for(x in 2:dim(dialzdat)[2]){

720

for(x in 2:dim(dialzdat)[2]){

723

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

721

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

724

x <- x + 1

722

x <- x + 1

725

}

723

}

726

colnames(nacol) <- colnames(dialzdat)

724

colnames(nacol) <- colnames(dialzdat)

727

dialzdat <- bind_rows(dialzdat,nacol)

725

dialzdat <- bind_rows(dialzdat,nacol)

728

726

729

##NAs in a row

727

##NAs in a row

730

y <- 1

728

y <- 1

731

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

729

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

732

for(y in 1:dim(dialzdat)[1]){

730

for(y in 1:dim(dialzdat)[1]){

733

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

731

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

734

y <- y + 1

732

y <- y + 1

735

}

733

}

736

colnames(narowd) <- "ROW_NAs"

734

colnames(narowd) <- "ROW_NAs"

737

dialzdat <- bind_cols(dialzdat,narowd)

735

dialzdat <- bind_cols(dialzdat,narowd)

738

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

736

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

739

colnames(RAWWORD) <- colnames(dialzdat)

737

colnames(RAWWORD) <- colnames(dialzdat)

740

##converting to character so that the clinical data can be brought together with the discrete data

738

##converting to character so that the clinical data can be brought together with the discrete data

741

k <- 2

739

k <- 2

742

for(k in 2:dim(dialzdat)[2]-1){

740

for(k in 2:dim(dialzdat)[2]-1){

743

dialzdat[,k] <- as.character(dialzdat[,k])

741

dialzdat[,k] <- as.character(dialzdat[,k])

744

k <- k + 1

742

k <- k + 1

745

}

743

}

746

#The end: the full data

744

#The end: the full data

747

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

745

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

748

746

749

#Produces Discrete file

747

#Produces Discrete file

750

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

748

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

751

.[[1]] %>%

749

.[[1]] %>%

752

.[length(.)] %>%

750

.[length(.)] %>%

753

gsub("\\D","",.) %>%

751

gsub("\\D","",.) %>%

754

c("GSE",.,"dscrt.txt") %>%

752

c("GSE",.,"dscrt.txt") %>%

755

paste(collapse = "")

753

paste(collapse = "")

756

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

754

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

757

755

758

756

759

n <- n + 1

757

n <- n + 1

760

}

758

}

761

}

759

}

762

}

760

}

763

}

761

}

764

#The Rest of this code will be used every time you want to change a data set

762

#The Rest of this code will be used every time you want to change a data set

765

THEFT()

763

THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Automated version of RCleanDscret.R