Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Efrain H. Gonzalez

1

#Efrain H. Gonzalez

2

#6/19/2017

2

#6/16/2017

3

#Libraries required to run the code

3

#Libraries required to run the code

4

library(pryr)

4

library(pryr)

5

library(MASS)

5

library(MASS)

6

library(dplyr)

6

library(dplyr)

7

library(tidyr)

7

library(tidyr)

8

library(readr)

8

library(readr)

9

library(stringr)

9

library(stringr)

10

11

12

#Necessary Functions

12

#Necessary Functions

13

#1#Function for handling the changing of row names and column names
# Renames the columns of a transposed GEO "!Sample_..." table.
#
# Args:
#   mat: a matrix (or data frame) that already carries column names and has
#        at least two rows. Row 1 holds the GEO field label of each column
#        (e.g. "!Sample_title"); row 2 holds the first value of that column.
#
# Returns:
#   `mat` with identical contents and the following columns renamed:
#     * label "!Sample_source_name_ch1" -> "Brain_Region"
#     * label "!Sample_title"           -> "Title"
#     * label "!Sample_geo_accession"   -> "ID_REF"
#     * any non-accession column whose row-2 value matches a clinical pattern
#       -> "Sex<k>", "PMI<k>", "Age<k>", "Braak<k>" or "Group<k>", where <k>
#       counts the matches of that kind so far (starting at 1).
#   Columns matching nothing keep their current name.
chngrownm <- function(mat) {
  # Patterns are tested against row 2 in exactly this order. A later match
  # overwrites the name given by an earlier one, and every match advances
  # its own counter.
  value_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  next_index <- c(Sex = 1L, PMI = 1L, Age = 1L, Braak = 1L, Group = 1L)

  # seq_len() keeps the loop empty for a zero-column input (1:0 would not).
  for (j in seq_len(ncol(mat))) {
    label <- mat[1, j]

    # isTRUE() makes a missing label behave as "no match" instead of
    # aborting the `if` with "missing value where TRUE/FALSE needed".
    if (isTRUE(label == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if (isTRUE(label == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    }
    # NOTE(review): only the accession column is protected from the pattern
    # checks below; a "Brain_Region" or "Title" column whose first value
    # matches a pattern is renamed again. Kept as in the original - confirm
    # whether an if / else-if chain was intended.
    if (isTRUE(label == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      for (prefix in names(value_patterns)) {
        if (grepl(value_patterns[[prefix]], mat[2, j])) {
          colnames(mat)[j] <- paste0(prefix, next_index[[prefix]])
          next_index[[prefix]] <- next_index[[prefix]] + 1L
        }
      }
    }
  }
  mat
}

59

60

#2#Function for reorganizing information within the columns
# Cleans the clinical columns of a table whose columns were named by
# chngrownm(). Column 1 is never touched; every other column is cleaned
# according to the keyword found in its name:
#   "Group" : drops everything up to and including ": ", and any trailing
#             part matching "\\s.+;.+"
#   "Age"   : drops every non-digit character, then converts to integer
#   "Sex"   : drops everything up to and including ": "
#   "PMI"   : keeps only digits and ".", then converts to numeric
#   "Braak" : drops everything up to and including ": ", reads the rest as a
#             roman numeral and converts it to integer
#
# Args:
#   mat: data frame (or matrix) with column names.
#
# Returns:
#   `mat` with the matching columns rewritten as described above.
cinfo <- function(mat) {
  col_names <- colnames(mat)

  # seq_len(n)[-1] is 2..n and is empty when there is a single column; the
  # former 2:col ran over c(2, 1) in that case and rewrote column 1.
  for (j in seq_len(ncol(mat))[-1]) {
    name_j <- col_names[j]

    if (grepl("Group", name_j)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", name_j)) {
      # NOTE(review): "\\D" also removes a decimal point, so "85.5" becomes
      # 855 - confirm ages are always whole numbers.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", name_j)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", name_j)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", name_j)) {
      mat[, j] <- as.integer(utils::as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

88

89

#3#Function for labeling the gene IDs without names
# Fills in missing gene names with the gene ID of the same row.
#
# Args:
#   GIDNAM: two-column table (data frame / tibble, or matrix). Column 1 holds
#           the gene ID, column 2 the gene name.
#
# Returns:
#   `GIDNAM` where every column-2 entry that is NA, or is the text "NA"
#   optionally followed by whitespace, has been replaced by the column-1
#   value of that row. All other entries are unchanged.
NAFIXING <- function(GIDNAM) {
  # Data frames are read with [[ ]] so tibbles yield plain vectors too;
  # matrices have to be read with [, k].
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Copy labels rather than the integer codes of a factor ID column.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }

  # One vectorised mask replaces the former row-by-row loop, which was slow
  # on large tables and failed on a zero-row table (1:0 visits rows 1 and 0).
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)

  if (any(unnamed)) {
    gene_names[unnamed] <- ids[unnamed]
    if (is_df) {
      GIDNAM[[2]] <- gene_names
    } else {
      GIDNAM[, 2] <- gene_names
    }
  }
  GIDNAM
}

101

102

#4#Function for changing the gene ID to gene name
# Replaces the gene IDs in the first row of DATA by their gene names.
#
# Args:
#   GeneName: matrix with two rows; row 1 holds gene IDs and row 2 the gene
#             name belonging to the ID in the same column.
#   DATA:     matrix whose first row holds the gene IDs to translate.
#
# Returns:
#   `DATA` where every first-row entry that is exactly equal to an ID in
#   GeneName[1, ] has been replaced by the corresponding GeneName[2, ] entry.
#   If an ID occurs more than once in GeneName the first occurrence is used.
#   Entries without a matching ID, and all other rows, are unchanged.
cgeneID <- function(GeneName, DATA) {
  if (ncol(GeneName) == 0L || ncol(DATA) == 0L) {
    return(DATA)
  }

  gene_ids <- GeneName[1, ]
  gene_names <- GeneName[2, ]

  # Exact lookup instead of building one regular expression per ID: IDs that
  # contain regex metacharacters (".", "(", "+", ...) used to match the wrong
  # entries or fail to compile, and every ID triggered a full scan of DATA.
  # Each entry is translated once; a new name is never re-matched against a
  # later ID. `incomparables = NA` keeps missing IDs from pairing up.
  hit <- match(DATA[1, ], gene_ids, incomparables = NA)
  found <- !is.na(hit)

  if (any(found)) {
    DATA[1, found] <- gene_names[hit[found]]
  }
  DATA
}

117

118

#5#Function for adjusting the gene names
# Reduces each column name of the form "A /// B /// C" to a single gene name.
#
# Args:
#   DiData: matrix or data frame with column names.
#   usecol: which "///"-separated piece to keep (default 1). Names with fewer
#           than `usecol` pieces fall back to their first piece.
#
# Returns:
#   Character vector with one entry per column of DiData: the chosen piece
#   with leading and trailing whitespace removed. An empty column name
#   yields NA.
gcnames <- function(DiData, usecol = 1) {
  # A table without columns has nothing to rename (the former 1:nuruns loop
  # ran over c(1, 0) here and failed).
  if (ncol(DiData) == 0L) {
    return(character(0))
  }

  # Split every name once; "///" contains no regex metacharacters, so a
  # fixed split gives the same pieces.
  pieces <- strsplit(colnames(DiData), "///", fixed = TRUE)

  vapply(
    pieces,
    function(parts) {
      pick <- if (length(parts) >= usecol) usecol else 1L
      # Base trimws() stands in for stringr::str_trim(): both strip spaces,
      # tabs and line breaks from both ends.
      trimws(parts[pick])
    },
    character(1),
    USE.NAMES = FALSE
  )
}

134

135

#6# Function for discretizing the data
# Maps every value of a numeric table onto three levels:
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
#   missing (NA/NaN)  -> kept as is
#
# Args:
#   NDATA: numeric matrix (or all-numeric data frame).
#
# Returns:
#   Numeric matrix with the dimensions and column names of NDATA holding the
#   levels above. Row names are not carried over.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start everything in the middle bin and move the tails out. A value of
  # exactly 1 used to match none of the three tests and silently kept the
  # initial 0; it now lands in the middle bin together with -1.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are not used as
  # subscripts here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  # Missing cells keep their original missing value.
  missing <- is.na(values)
  DDATA[missing] <- values[missing]

  DDATA
}

165

166

167

#MajorFunction#This is the function that does everything else

167

#MajorFunction#This is the function that does everything else

168

THEFT <- function(){

168

THEFT <- function(){

169

#Set working directory based on the directory of the series matrix file Currently only works for windows

169

#Set working directory based on the directory of the series matrix file Currently only works for windows

170

wd <- getwd()

170

wd <- getwd()

171

#list.files()

171

#list.files()

172

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

172

#gsub("wd",wd,"Do you want to clean all data files in the directory wd?")

173

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

173

numDAT <- switch(EXPR = menu(choices = c("Yes","No"),title = gsub("wd",wd,"Do you want to clean all data files in the directory wd?")) + 1,cat("Nothing done\n"),1L,2L)

174

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

174

GSEfileloc <- grep("^GSE.+\\.txt\\.gz$",list.files())

175

176

#ALL DATA FILES WILL BE CLEANED

176

#ALL DATA FILES WILL BE CLEANED

177

if(numDAT == 1){

177

if(numDAT == 1){

178

#indexing the data files

178

#indexing the data files

179

n <- 1

179

n <- 1

180

for(n in 1: length(GSEfileloc)){

180

for(n in 1: length(GSEfileloc)){

181

alz <- list.files()[GSEfileloc[n]]

181

alz <- list.files()[GSEfileloc[n]]

182

183

#Working with the wordy part of the document

183

#Working with the wordy part of the document

184

alzword <- alz %>%

184

alzword <- alz %>%

185

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

185

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

186

filter(grepl("!Sample",X1))%>%

186

filter(grepl("!Sample",X1))%>%

187

filter(!grepl("!Sample_contact",X1))

187

filter(!grepl("!Sample_contact",X1))

188

189

#Getting the GPL file

189

#Getting the GPL file

190

genena <- grep("_platform_id",alzword$X1) %>%

190

genena <- grep("_platform_id",alzword$X1) %>%

191

alzword$X2[.] %>%

191

alzword$X2[.] %>%

192

str_trim(.) %>%

192

str_trim(.) %>%

193

paste0("^",.) %>%

193

paste0("^",.) %>%

194

grep(.,list.files()) %>%

194

grep(.,list.files()) %>%

195

list.files()[.]

195

list.files()[.]

196

197

#Find out if it is a soft GPL file or not

197

#Find out if it is a soft GPL file or not

198

soft <- strsplit(genena,"[\\|/]") %>%

198

soft <- strsplit(genena,"[\\|/]") %>%

199

.[[1]] %>%

199

.[[1]] %>%

200

.[length(.)] %>%

200

.[length(.)] %>%

201

grepl("soft",.)

201

grepl("soft",.)

202

203

##Changing row names and column names:

203

##Changing row names and column names:

204

ALZWORD <- t(alzword)

204

ALZWORD <- t(alzword)

205

rownames(ALZWORD)=NULL

205

rownames(ALZWORD)=NULL

206

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

206

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

207

ALZWORD <- chngrownm(ALZWORD)[-1,]

207

ALZWORD <- chngrownm(ALZWORD)[-1,]

208

ALZWORD <- ALZWORD%>%

208

ALZWORD <- ALZWORD%>%

209

as.data.frame()%>%

209

as.data.frame()%>%

210

dplyr::select(-starts_with("col"))

210

dplyr::select(-starts_with("col"))

211

212

##Reorganizing information within the columns and final clinical data

212

##Reorganizing information within the columns and final clinical data

213

ALZWORDF <- cinfo(ALZWORD)

213

ALZWORDF <- cinfo(ALZWORD)

214

215

216

#Working with Actual Data part of file

216

#Working with Actual Data part of file

217

alzdat <- alz %>%

217

alzdat <- alz %>%

218

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

218

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

219

ALZDAT <- t(alzdat[,-1])

219

ALZDAT <- t(alzdat[,-1])

220

rownames(ALZDAT)=NULL

220

rownames(ALZDAT)=NULL

221

222

##Is there a clean version of the GPL file available?

222

##Is there a clean version of the GPL file available?

223

gplnum <- strsplit(genena,"[\\|/]") %>%

223

gplnum <- strsplit(genena,"[\\|/]") %>%

224

.[[1]] %>%

224

.[[1]] %>%

225

.[length(.)] %>%

225

.[length(.)] %>%

226

gsub("\\D","",.)

226

gsub("\\D","",.)

227

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

227

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

228

if(clfileex >= 1){

228

if(clfileex >= 1){

229

#use the clean version

229

#use the clean version

230

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

230

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

231

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

231

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

232

233

}

233

}

234

if(clfileex == 0){

234

if(clfileex == 0){

235

##Lets Create a clean version

235

##Lets Create a clean version

236

237

##Gene ID to Gene Name

237

##Gene ID to Gene Name

238

if(soft == TRUE){

238

if(soft == TRUE){

239

#Check to see if there is already a file containing information on soft files

239

#Check to see if there is already a file containing information on soft files

240

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

240

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

241

if(fileex == 1){

241

if(fileex == 1){

242

#Check to see if this GPL soft file has been used before

242

#Check to see if this GPL soft file has been used before

243

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

243

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

244

.$GPL_FILE_NUM%>%

244

.$GPL_FILE_NUM%>%

245

grepl(gplnum,.) %>%

245

grepl(gplnum,.) %>%

246

sum()

246

sum()

247

if(IDF == 1){

247

if(IDF == 1){

248

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

248

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

249

.$GPL_FILE_NUM%>%

249

.$GPL_FILE_NUM%>%

250

grep(gplnum,.)

250

grep(gplnum,.)

251

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

251

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

252

.$LOC_ID %>%

252

.$LOC_ID %>%

253

.[IDLOCAL]

253

.[IDLOCAL]

254

geneIDNam <- genena %>%

254

geneIDNam <- genena %>%

255

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

255

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

256

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

256

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

257

}

257

}

258

if(IDF == 0){

258

if(IDF == 0){

259

#No information on this particular GPL file

259

#No information on this particular GPL file

260

idLOCGPL <- genena %>%

260

idLOCGPL <- genena %>%

261

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

261

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

262

t(.) %>%

262

t(.) %>%

263

grep("^ID\\s*$",.) %>%

263

grep("^ID\\s*$",.) %>%

264

-1

264

-1

265

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

265

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

266

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

266

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

267

geneIDNam <- genena %>%

267

geneIDNam <- genena %>%

268

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

268

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

269

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

269

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

270

}

270

}

271

}

271

}

272

if(fileex == 0){

272

if(fileex == 0){

273

#We must create a file that we can access for later use

273

#We must create a file that we can access for later use

274

idLOCGPL <- genena %>%

274

idLOCGPL <- genena %>%

275

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

275

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

276

t(.) %>%

276

t(.) %>%

277

grep("^ID\\s*$",.) %>%

277

grep("^ID\\s*$",.) %>%

278

-1

278

-1

279

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

279

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

280

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

280

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

281

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

281

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

282

geneIDNam <- genena %>%

282

geneIDNam <- genena %>%

283

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

283

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

284

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

284

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

285

}

285

}

286

}

286

}

287

if(soft == FALSE){

287

if(soft == FALSE){

288

geneIDNam <- genena %>%

288

geneIDNam <- genena %>%

289

read_delim(delim="\t",comment = "#")%>%

289

read_delim(delim="\t",comment = "#")%>%

290

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

290

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

291

}

291

}

292

293

##Labeling the gene IDs without names

293

##Labeling the gene IDs without names

294

geneIDNam <- NAFIXING(geneIDNam)

294

geneIDNam <- NAFIXING(geneIDNam)

295

296

##remove the whitespace

296

##remove the whitespace

297

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

297

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

298

299

##Here is the clean version

299

##Here is the clean version

300

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

300

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

301

}

301

}

302

303

304

305

##Changing the gene ID to gene name

305

##Changing the gene ID to gene name

306

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

306

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

307

colnames(ALZDAT) = ALZDAT1[1,]

307

colnames(ALZDAT) = ALZDAT1[1,]

308

309

310

##Adjusting the column names aka the gene names

310

##Adjusting the column names aka the gene names

311

colnames(ALZDAT) <- gcnames(ALZDAT)

311

colnames(ALZDAT) <- gcnames(ALZDAT)

312

313

314

#Full RAW Data

314

#Full RAW Data

315

Fullalzdwr <- ALZDAT %>%

315

Fullalzdwr <- ALZDAT %>%

316

as.data.frame() %>%

316

as.data.frame() %>%

317

cbind(ALZWORDF,.)

317

cbind(ALZWORDF,.)

318

319

#Raw file is output

319

#Raw file is output

320

nfnaex <- strsplit(alz,"[\\]") %>%

320

nfnaex <- strsplit(alz,"[\\]") %>%

321

.[[1]] %>%

321

.[[1]] %>%

322

.[length(.)] %>%

322

.[length(.)] %>%

323

gsub("\\D","",.) %>%

323

gsub("\\D","",.) %>%

324

c("GSE",.,"aftexcel.txt") %>%

324

c("GSE",.,"aftexcel.txt") %>%

325

paste(collapse = "")

325

paste(collapse = "")

326

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

326

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

327

328

329

330

#Now for the discretization part

330

#Now for the discretization part

331

##get the wordy part again

331

##get the wordy part again

332

rawword <- t(ALZWORDF)

332

rawword <- t(ALZWORDF)

333

334

##where is ID_REF located

334

##where is ID_REF located

335

hereim <- grep("ID_REF",rownames(rawword))

335

hereim <- grep("ID_REF",rownames(rawword))

336

337

##Subject Names GSM...

337

##Subject Names GSM...

338

subjnam <- rawword[hereim,]

338

subjnam <- rawword[hereim,]

339

340

##Getting the names for the rows

340

##Getting the names for the rows

341

namedarows <- rownames(rawword)[-hereim] %>%

341

namedarows <- rownames(rawword)[-hereim] %>%

342

as.data.frame()

342

as.data.frame()

343

RAWWORD <- rawword[-hereim,] %>%

343

RAWWORD <- rawword[-hereim,] %>%

344

as.data.frame() %>%

344

as.data.frame() %>%

345

bind_cols(namedarows,.)

345

bind_cols(namedarows,.)

346

z <- 1

346

z <- 1

347

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

347

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

348

for(z in 1:dim(RAWWORD)[1]){

348

for(z in 1:dim(RAWWORD)[1]){

349

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

349

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

350

z <- z + 1

350

z <- z + 1

351

}

351

}

352

353

colnames(naroww) <- "ROW_NAs"

353

colnames(naroww) <- "ROW_NAs"

354

RAWWORD <- bind_cols(RAWWORD,naroww)

354

RAWWORD <- bind_cols(RAWWORD,naroww)

355

356

357

roALZna <- t(ALZDAT) %>%

357

roALZna <- t(ALZDAT) %>%

358

rownames(.) %>%

358

rownames(.) %>%

359

as.data.frame(.)

359

as.data.frame(.)

360

colnames(roALZna) <- "ID_REF"

360

colnames(roALZna) <- "ID_REF"

361

362

RAWDAT <- t(ALZDAT) %>%

362

RAWDAT <- t(ALZDAT) %>%

363

as.data.frame(.)

363

as.data.frame(.)

364

colnames(RAWDAT) <- NULL

364

colnames(RAWDAT) <- NULL

365

rownames(RAWDAT) <- NULL

365

rownames(RAWDAT) <- NULL

366

367

RAWDAT2 <- RAWDAT %>%

367

RAWDAT2 <- RAWDAT %>%

368

cbind(roALZna,.) %>%

368

cbind(roALZna,.) %>%

369

dplyr::arrange(.,ID_REF)

369

dplyr::arrange(.,ID_REF)

370

371

##Editing the file for R processing

371

##Editing the file for R processing

372

RAWDATID <- RAWDAT2[,1] %>%

372

RAWDATID <- RAWDAT2[,1] %>%

373

as.matrix(.)

373

as.matrix(.)

374

375

RAWDATNUM <- RAWDAT2[,-1] %>%

375

RAWDATNUM <- RAWDAT2[,-1] %>%

376

mapply(.,FUN = as.numeric) %>%

376

mapply(.,FUN = as.numeric) %>%

377

t(.)

377

t(.)

378

379

##Consolidating genes with the same name

379

##Consolidating genes with the same name

380

###create empty matrix of size equal to tabRDATID

380

###create empty matrix of size equal to tabRDATID

381

tabRDATID <- table(RAWDATID)

381

tabRDATID <- table(RAWDATID)

382

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

382

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

383

j <- 1

383

j <- 1

384

for(j in 1:length(tabRDATID)){

384

for(j in 1:length(tabRDATID)){

385

##Putting the ones without duplicates in their new homes

385

##Putting the ones without duplicates in their new homes

386

if(tabRDATID[j] == 1){

386

if(tabRDATID[j] == 1){

387

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

387

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

388

}

388

}

389

##Averaging duplicates and putting them in their new homes

389

##Averaging duplicates and putting them in their new homes

390

if(tabRDATID[j] > 1){

390

if(tabRDATID[j] > 1){

391

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

391

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

392

}

392

}

393

j <- j + 1

393

j <- j + 1

394

}

394

}

395

396

##Scaling the Data

396

##Scaling the Data

397

scrawdat <- NuRDATN%>%

397

scrawdat <- NuRDATN%>%

398

scale()

398

scale()

399

attr(scrawdat,"scaled:center") <- NULL

399

attr(scrawdat,"scaled:center") <- NULL

400

attr(scrawdat,"scaled:scale") <- NULL

400

attr(scrawdat,"scaled:scale") <- NULL

401

colnames(scrawdat) <- rownames(tabRDATID)

401

colnames(scrawdat) <- rownames(tabRDATID)

402

403

##Discretized the Data

403

##Discretized the Data

404

dialzdat <- scrawdat %>%

404

dialzdat <- scrawdat %>%

405

dndat(.) %>%

405

dndat(.) %>%

406

t()%>%

406

t()%>%

407

as.data.frame(.)

407

as.data.frame(.)

408

colnames(dialzdat) <- rownames(RAWDATNUM)

408

colnames(dialzdat) <- rownames(RAWDATNUM)

409

410

##setting "ID_REF" as a new variable

410

##setting "ID_REF" as a new variable

411

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

411

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

412

colnames(geneNAM) <- "ID_REF"

412

colnames(geneNAM) <- "ID_REF"

413

rownames(dialzdat) <- NULL

413

rownames(dialzdat) <- NULL

414

dialzdat <-bind_cols(geneNAM,dialzdat)

414

dialzdat <-bind_cols(geneNAM,dialzdat)

415

416

##NAs in a column

416

##NAs in a column

417

x <- 2

417

x <- 2

418

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

418

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

419

nacol[1,1] = "COL_NAs"

419

nacol[1,1] = "COL_NAs"

420

for(x in 2:dim(dialzdat)[2]){

420

for(x in 2:dim(dialzdat)[2]){

421

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

421

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

422

x <- x + 1

422

x <- x + 1

423

}

423

}

424

colnames(nacol) <- colnames(dialzdat)

424

colnames(nacol) <- colnames(dialzdat)

425

dialzdat <- bind_rows(dialzdat,nacol)

425

dialzdat <- bind_rows(dialzdat,nacol)

426

427

##NAs in a row

427

##NAs in a row

428

y <- 1

428

y <- 1

429

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

429

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

430

for(y in 1:dim(dialzdat)[1]){

430

for(y in 1:dim(dialzdat)[1]){

431

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

431

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

432

y <- y + 1

432

y <- y + 1

433

}

433

}

434

colnames(narowd) <- "ROW_NAs"

434

colnames(narowd) <- "ROW_NAs"

435

dialzdat <- bind_cols(dialzdat,narowd)

435

dialzdat <- bind_cols(dialzdat,narowd)

436

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

436

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

437

colnames(RAWWORD) <- colnames(dialzdat)

437

colnames(RAWWORD) <- colnames(dialzdat)

438

##converting to character so that the clinical can be brought together with discrete data

438

##converting to character so that the clinical can be brought together with discrete data

439

k <- 2

439

k <- 2

440

for(k in 2:dim(dialzdat)[2]-1){

440

for(k in 2:dim(dialzdat)[2]-1){

441

dialzdat[,k] <- as.character(dialzdat[,k])

441

dialzdat[,k] <- as.character(dialzdat[,k])

442

k <- k + 1

442

k <- k + 1

443

}

443

}

444

#The End the full data

444

#The End the full data

445

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

445

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

446

447

#Produces Discrete file

447

#Produces Discrete file

448

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

448

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

449

.[[1]] %>%

449

.[[1]] %>%

450

.[length(.)] %>%

450

.[length(.)] %>%

451

gsub("\\D","",.) %>%

451

gsub("\\D","",.) %>%

452

c("GSE",.,"dscrt.txt") %>%

452

c("GSE",.,"dscrt.txt") %>%

453

paste(collapse = "")

453

paste(collapse = "")

454

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

454

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

455

n <- n +1

455

n <- n +1

456

}

456

}

457

}

457

}

458

459

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

459

#CHOOSE A DATA FILE TO CLEAN OR SEVERAL DATA FILES TO CLEAN

460

if(numDAT == 2){

460

if(numDAT == 2){

461

#All the files you want to analyze

461

#All the files you want to analyze

462

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

462

ANDIS <- select.list(choices = list.files()[GSEfileloc],multiple = TRUE, title = "Choose the file/files you want to analyze:")

463

if(length(ANDIS) == 0){

463

if(length(ANDIS) == 0){

464

#Spit out a warning

464

#Spit out a warning

465

warning("You did not select any files and so no cleaning will be performed")

465

warning("You did not select any files and so no cleaning will be performed")

466

} else{

466

} else{

467

#indexing the data files

467

#indexing the data files

468

n <- 1

468

n <- 1

469

for(n in 1: length(ANDIS)){

469

for(n in 1: length(ANDIS)){

470

alz <- ANDIS[n]

470

alz <- ANDIS[n]

471

472

#Working with the wordy part of the document

472

#Working with the wordy part of the document

473

alzword <- alz %>%

473

alzword <- alz %>%

474

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

474

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

475

filter(grepl("!Sample",X1))%>%

475

filter(grepl("!Sample",X1))%>%

476

filter(!grepl("!Sample_contact",X1))

476

filter(!grepl("!Sample_contact",X1))

477

478

#Getting the GPL file

478

#Getting the GPL file

479

genena <- grep("_platform_id",alzword$X1) %>%

479

genena <- grep("_platform_id",alzword$X1) %>%

480

alzword$X2[.] %>%

480

alzword$X2[.] %>%

481

str_trim(.) %>%

481

str_trim(.) %>%

482

paste0("^",.) %>%

482

paste0("^",.) %>%

483

grep(.,list.files()) %>%

483

grep(.,list.files()) %>%

484

list.files()[.]

484

list.files()[.]

485

486

#Find out if it is a soft GPL file or not

486

#Find out if it is a soft GPL file or not

487

soft <- strsplit(genena,"[\\|/]") %>%

487

soft <- strsplit(genena,"[\\|/]") %>%

488

.[[1]] %>%

488

.[[1]] %>%

489

.[length(.)] %>%

489

.[length(.)] %>%

490

grepl("soft",.)

490

grepl("soft",.)

491

492

##Changing row names and column names:

492

##Changing row names and column names:

493

ALZWORD <- t(alzword)

493

ALZWORD <- t(alzword)

494

rownames(ALZWORD)=NULL

494

rownames(ALZWORD)=NULL

495

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

495

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

496

ALZWORD <- chngrownm(ALZWORD)[-1,]

496

ALZWORD <- chngrownm(ALZWORD)[-1,]

497

ALZWORD <- ALZWORD%>%

497

ALZWORD <- ALZWORD%>%

498

as.data.frame()%>%

498

as.data.frame()%>%

499

dplyr::select(-starts_with("col"))

499

dplyr::select(-starts_with("col"))

500

501

##Reorganizing information within the columns and final clinical data

501

##Reorganizing information within the columns and final clinical data

502

ALZWORDF <- cinfo(ALZWORD)

502

ALZWORDF <- cinfo(ALZWORD)

503

504

505

#Working with Actual Data part of file

505

#Working with Actual Data part of file

506

alzdat <- alz %>%

506

alzdat <- alz %>%

507

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

507

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

508

ALZDAT <- t(alzdat[,-1])

508

ALZDAT <- t(alzdat[,-1])

509

rownames(ALZDAT)=NULL

509

rownames(ALZDAT)=NULL

510

511

##Is there a clean version of the GPL file available?

511

##Is there a clean version of the GPL file available?

512

gplnum <- strsplit(genena,"[\\|/]") %>%

512

gplnum <- strsplit(genena,"[\\|/]") %>%

513

.[[1]] %>%

513

.[[1]] %>%

514

.[length(.)] %>%

514

.[length(.)] %>%

515

gsub("\\D","",.)

515

gsub("\\D","",.)

516

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

516

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

517

if(clfileex >= 1){

517

if(clfileex >= 1){

518

#use the clean version

518

#use the clean version

519

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

519

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

520

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

520

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

521

522

}

522

}

523

if(clfileex == 0){

523

if(clfileex == 0){

524

##Lets Create a clean version

524

##Lets Create a clean version

525

526

##Gene ID to Gene Name

526

##Gene ID to Gene Name

527

if(soft == TRUE){

527

if(soft == TRUE){

528

#Check to see if there is already a file containing information on soft files

528

#Check to see if there is already a file containing information on soft files

529

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

529

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

530

if(fileex == 1){

530

if(fileex == 1){

531

#Check to see if this GPL soft file has been used before

531

#Check to see if this GPL soft file has been used before

532

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

532

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

533

.$GPL_FILE_NUM%>%

533

.$GPL_FILE_NUM%>%

534

grepl(gplnum,.) %>%

534

grepl(gplnum,.) %>%

535

sum()

535

sum()

536

if(IDF == 1){

536

if(IDF == 1){

537

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

537

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

538

.$GPL_FILE_NUM%>%

538

.$GPL_FILE_NUM%>%

539

grep(gplnum,.)

539

grep(gplnum,.)

540

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

540

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

541

.$LOC_ID %>%

541

.$LOC_ID %>%

542

.[IDLOCAL]

542

.[IDLOCAL]

543

geneIDNam <- genena %>%

543

geneIDNam <- genena %>%

544

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

544

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

545

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

545

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

546

}

546

}

547

if(IDF == 0){

547

if(IDF == 0){

548

#No information on this particular GPL file

548

#No information on this particular GPL file

549

idLOCGPL <- genena %>%

549

idLOCGPL <- genena %>%

550

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

550

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

551

t(.) %>%

551

t(.) %>%

552

grep("^ID\\s*$",.) %>%

552

grep("^ID\\s*$",.) %>%

553

-1

553

-1

554

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

554

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

555

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

555

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

556

geneIDNam <- genena %>%

556

geneIDNam <- genena %>%

557

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

557

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

558

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

558

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

559

}

559

}

560

}

560

}

561

if(fileex == 0){

561

if(fileex == 0){

562

#We must create a file that we can access for later use

562

#We must create a file that we can access for later use

563

idLOCGPL <- genena %>%

563

idLOCGPL <- genena %>%

564

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

564

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

565

t(.) %>%

565

t(.) %>%

566

grep("^ID\\s*$",.) %>%

566

grep("^ID\\s*$",.) %>%

567

-1

567

-1

568

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

568

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

569

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

569

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

570

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

570

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

571

geneIDNam <- genena %>%

571

geneIDNam <- genena %>%

572

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

572

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

573

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

573

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

574

}

574

}

575

}

575

}

576

if(soft == FALSE){

576

if(soft == FALSE){

577

geneIDNam <- genena %>%

577

geneIDNam <- genena %>%

578

read_delim(delim="\t",comment = "#")%>%

578

read_delim(delim="\t",comment = "#")%>%

579

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

579

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

580

}

580

}

581

582

##Labeling the gene IDs without names

582

##Labeling the gene IDs without names

583

geneIDNam <- NAFIXING(geneIDNam)

583

geneIDNam <- NAFIXING(geneIDNam)

584

585

##remove the whitespace

585

##remove the whitespace

586

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

586

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

587

588

##Here is the clean version

588

##Here is the clean version

589

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

589

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

590

}

590

}

591

592

593

594

##Changing the gene ID to gene name

594

##Changing the gene ID to gene name

595

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

595

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

596

colnames(ALZDAT) = ALZDAT1[1,]

596

colnames(ALZDAT) = ALZDAT1[1,]

597

598

599

##Adjusting the column names aka the gene names

599

##Adjusting the column names aka the gene names

600

colnames(ALZDAT) <- gcnames(ALZDAT)

600

colnames(ALZDAT) <- gcnames(ALZDAT)

601

602

603

#Full RAW Data

603

#Full RAW Data

604

Fullalzdwr <- ALZDAT %>%

604

Fullalzdwr <- ALZDAT %>%

605

as.data.frame() %>%

605

as.data.frame() %>%

606

cbind(ALZWORDF,.)

606

cbind(ALZWORDF,.)

607

608

#Raw file is output

608

#Raw file is output

609

nfnaex <- strsplit(alz,"[\\]") %>%

609

nfnaex <- strsplit(alz,"[\\]") %>%

610

.[[1]] %>%

610

.[[1]] %>%

611

.[length(.)] %>%

611

.[length(.)] %>%

612

gsub("\\D","",.) %>%

612

gsub("\\D","",.) %>%

613

c("GSE",.,"aftexcel.txt") %>%

613

c("GSE",.,"aftexcel.txt") %>%

614

paste(collapse = "")

614

paste(collapse = "")

615

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

615

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

616

617

618

619

#Now for the discretization part

619

#Now for the discretization part

620

##get the wordy part again

620

##get the wordy part again

621

rawword <- t(ALZWORDF)

621

rawword <- t(ALZWORDF)

622

623

##where is ID_REF located

623

##where is ID_REF located

624

hereim <- grep("ID_REF",rownames(rawword))

624

hereim <- grep("ID_REF",rownames(rawword))

625

626

##Subject Names GSM...

626

##Subject Names GSM...

627

subjnam <- rawword[hereim,]

627

subjnam <- rawword[hereim,]

628

629

##Getting the names for the rows

629

##Getting the names for the rows

630

namedarows <- rownames(rawword)[-hereim] %>%

630

namedarows <- rownames(rawword)[-hereim] %>%

631

as.data.frame()

631

as.data.frame()

632

RAWWORD <- rawword[-hereim,] %>%

632

RAWWORD <- rawword[-hereim,] %>%

633

as.data.frame() %>%

633

as.data.frame() %>%

634

bind_cols(namedarows,.)

634

bind_cols(namedarows,.)

635

z <- 1

635

z <- 1

636

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

636

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

637

for(z in 1:dim(RAWWORD)[1]){

637

for(z in 1:dim(RAWWORD)[1]){

638

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

638

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

639

z <- z + 1

639

z <- z + 1

640

}

640

}

641

642

colnames(naroww) <- "ROW_NAs"

642

colnames(naroww) <- "ROW_NAs"

643

RAWWORD <- bind_cols(RAWWORD,naroww)

643

RAWWORD <- bind_cols(RAWWORD,naroww)

644

645

646

roALZna <- t(ALZDAT) %>%

646

roALZna <- t(ALZDAT) %>%

647

rownames(.) %>%

647

rownames(.) %>%

648

as.data.frame(.)

648

as.data.frame(.)

649

colnames(roALZna) <- "ID_REF"

649

colnames(roALZna) <- "ID_REF"

650

651

RAWDAT <- t(ALZDAT) %>%

651

RAWDAT <- t(ALZDAT) %>%

652

as.data.frame(.)

652

as.data.frame(.)

653

colnames(RAWDAT) <- NULL

653

colnames(RAWDAT) <- NULL

654

rownames(RAWDAT) <- NULL

654

rownames(RAWDAT) <- NULL

655

656

RAWDAT2 <- RAWDAT %>%

656

RAWDAT2 <- RAWDAT %>%

657

cbind(roALZna,.) %>%

657

cbind(roALZna,.) %>%

658

dplyr::arrange(.,ID_REF)

658

dplyr::arrange(.,ID_REF)

659

660

##Editing the file for R processing

660

##Editing the file for R processing

661

RAWDATID <- RAWDAT2[,1] %>%

661

RAWDATID <- RAWDAT2[,1] %>%

662

as.matrix(.)

662

as.matrix(.)

663

664

RAWDATNUM <- RAWDAT2[,-1] %>%

664

RAWDATNUM <- RAWDAT2[,-1] %>%

665

mapply(.,FUN = as.numeric) %>%

665

mapply(.,FUN = as.numeric) %>%

666

t(.)

666

t(.)

667

668

##Consolidating genes with the same name

668

##Consolidating genes with the same name

669

###create empty matrix of size equal to tabRDATID

669

###create empty matrix of size equal to tabRDATID

670

tabRDATID <- table(RAWDATID)

670

tabRDATID <- table(RAWDATID)

671

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

671

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

672

j <- 1

672

j <- 1

673

for(j in 1:length(tabRDATID)){

673

for(j in 1:length(tabRDATID)){

674

##Putting the ones without duplicates in their new homes

674

##Putting the ones without duplicates in their new homes

675

if(tabRDATID[j] == 1){

675

if(tabRDATID[j] == 1){

676

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

676

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

677

}

677

}

678

##Averaging duplicates and putting them in their new homes

678

##Averaging duplicates and putting them in their new homes

679

if(tabRDATID[j] > 1){

679

if(tabRDATID[j] > 1){

680

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

680

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

681

}

681

}

682

j <- j + 1

682

j <- j + 1

683

}

683

}

684

685

##Scaling the Data

685

##Scaling the Data

686

scrawdat <- NuRDATN%>%

686

scrawdat <- NuRDATN%>%

687

scale()

687

scale()

688

attr(scrawdat,"scaled:center") <- NULL

688

attr(scrawdat,"scaled:center") <- NULL

689

attr(scrawdat,"scaled:scale") <- NULL

689

attr(scrawdat,"scaled:scale") <- NULL

690

colnames(scrawdat) <- rownames(tabRDATID)

690

colnames(scrawdat) <- rownames(tabRDATID)

691

692

##Discretized the Data

692

##Discretized the Data

693

dialzdat <- scrawdat %>%

693

dialzdat <- scrawdat %>%

694

dndat(.) %>%

694

dndat(.) %>%

695

t()%>%

695

t()%>%

696

as.data.frame(.)

696

as.data.frame(.)

697

colnames(dialzdat) <- rownames(RAWDATNUM)

697

colnames(dialzdat) <- rownames(RAWDATNUM)

698

699

##setting "ID_REF" as a new variable

699

##setting "ID_REF" as a new variable

700

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

700

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

701

colnames(geneNAM) <- "ID_REF"

701

colnames(geneNAM) <- "ID_REF"

702

rownames(dialzdat) <- NULL

702

rownames(dialzdat) <- NULL

703

dialzdat <-bind_cols(geneNAM,dialzdat)

703

dialzdat <-bind_cols(geneNAM,dialzdat)

704

705

##NAs in a column

705

##NAs in a column

706

x <- 2

706

x <- 2

707

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

707

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

708

nacol[1,1] = "COL_NAs"

708

nacol[1,1] = "COL_NAs"

709

for(x in 2:dim(dialzdat)[2]){

709

for(x in 2:dim(dialzdat)[2]){

710

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

710

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

711

x <- x + 1

711

x <- x + 1

712

}

712

}

713

colnames(nacol) <- colnames(dialzdat)

713

colnames(nacol) <- colnames(dialzdat)

714

dialzdat <- bind_rows(dialzdat,nacol)

714

dialzdat <- bind_rows(dialzdat,nacol)

715

716

##NAs in a row

716

##NAs in a row

717

y <- 1

717

y <- 1

718

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

718

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

719

for(y in 1:dim(dialzdat)[1]){

719

for(y in 1:dim(dialzdat)[1]){

720

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

720

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

721

y <- y + 1

721

y <- y + 1

722

}

722

}

723

colnames(narowd) <- "ROW_NAs"

723

colnames(narowd) <- "ROW_NAs"

724

dialzdat <- bind_cols(dialzdat,narowd)

724

dialzdat <- bind_cols(dialzdat,narowd)

725

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

725

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

726

colnames(RAWWORD) <- colnames(dialzdat)

726

colnames(RAWWORD) <- colnames(dialzdat)

727

##converting to character so that the clinical can be brought together with discrete data

727

##converting to character so that the clinical can be brought together with discrete data

728

k <- 2

728

k <- 2

729

for(k in 2:dim(dialzdat)[2]-1){

729

for(k in 2:dim(dialzdat)[2]-1){

730

dialzdat[,k] <- as.character(dialzdat[,k])

730

dialzdat[,k] <- as.character(dialzdat[,k])

731

k <- k + 1

731

k <- k + 1

732

}

732

}

733

#The end: the full data

733

#The end: the full data

734

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

734

Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

735

736

#Produces Discrete file

736

#Produces Discrete file

737

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

737

nfnaex2 <- strsplit(alz,"[\\|/]") %>%

738

.[[1]] %>%

738

.[[1]] %>%

739

.[length(.)] %>%

739

.[length(.)] %>%

740

gsub("\\D","",.) %>%

740

gsub("\\D","",.) %>%

741

c("GSE",.,"dscrt.txt") %>%

741

c("GSE",.,"dscrt.txt") %>%

742

paste(collapse = "")

742

paste(collapse = "")

743

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

743

write.table(Dscrtalzdw, file = nfnaex2, sep = "\t",col.names = TRUE,row.names = FALSE)

744

745

746

n <- n + 1

746

n <- n + 1

747

}

747

}

748

}

748

}

749

}

749

}

750

}

750

}

751

#The Rest of this code will be used every time you want to change a data set

751

#The Rest of this code will be used every time you want to change a data set

752

#Run the pipeline: call THEFT() with no arguments (presumably the function
#whose body closes just above -- confirm)
THEFT()

752

#Run the pipeline: call THEFT() with no arguments (presumably the function
#whose body closes just above -- confirm)
THEFT()

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

An automated version of RCleanDscret.R