Efrain Gonzalez / Cleaning and Fixing Data with R

1

##Posted 6/15/2017

1

##Posted 6/15/2017

2

3

4

#Libraries required to run the code

4

#Libraries required to run the code

5

library(pryr)

5

library(pryr)

6

library(MASS)

6

library(MASS)

7

library(dplyr)

7

library(dplyr)

8

library(tidyr)

8

library(tidyr)

9

library(readr)

9

library(readr)

10

library(stringr)

10

library(stringr)

11

12

13

#Necessary Functions

13

#Necessary Functions

14

#1#Function for handling the changing of row names and column names

14

#1#Function for handling the changing of row names and column names

15

chngrownm <- function(mat){

15

chngrownm <- function(mat){

16

row <- dim(mat)[1]

16

row <- dim(mat)[1]

17

col <- dim(mat)[2]

17

col <- dim(mat)[2]

18

j <- 1

18

j <- 1

19

x <- 1

19

x <- 1

20

p <- 1

20

p <- 1

21

a <- 1

21

a <- 1

22

b <- 1

22

b <- 1

23

g <- 1

23

g <- 1

24

for(j in 1:col){

24

for(j in 1:col){

25

if("!Sample_source_name_ch1"==mat[1,j]){

25

if("!Sample_source_name_ch1"==mat[1,j]){

26

colnames(mat)[j] <- "Brain_Region"

26

colnames(mat)[j] <- "Brain_Region"

27

}

27

} else if("!Sample_title" == mat[1,j]){

28

if("!Sample_title" == mat[1,j]){

29

colnames(mat)[j] <- "Title"

28

colnames(mat)[j] <- "Title"

30

}

29

} else if("!Sample_geo_accession" == mat[1,j]){

31

if("!Sample_geo_accession" == mat[1,j]){

32

colnames(mat)[j] <- "ID_REF"

30

colnames(mat)[j] <- "ID_REF"

33

} else{

31

} else{

34

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

32

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

35

colnames(mat)[j] <- paste0("Sex",x)

33

colnames(mat)[j] <- paste0("Sex",x)

36

x = x + 1

34

x = x + 1

37

}

35

}

38

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

36

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

39

colnames(mat)[j] <- paste0("PMI",p)

37

colnames(mat)[j] <- paste0("PMI",p)

40

p = p + 1

38

p = p + 1

41

}

39

}

42

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

40

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

43

colnames(mat)[j] <- paste0("Age",a)

41

colnames(mat)[j] <- paste0("Age",a)

44

a = a + 1

42

a = a + 1

45

}

43

}

46

if(grepl("braak|b&b",mat[2,j])==TRUE){

44

if(grepl("braak|b&b",mat[2,j])==TRUE){

47

colnames(mat)[j] <- paste0("Braak",b)

45

colnames(mat)[j] <- paste0("Braak",b)

48

b = b + 1

46

b = b + 1

49

}

47

}

50

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){

48

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",mat[2,j])==TRUE){

51

colnames(mat)[j] <- paste0("Group",g)

49

colnames(mat)[j] <- paste0("Group",g)

52

g = g + 1

50

g = g + 1

53

}

51

}

54

52

55

}

53

}

56

j = j + 1

54

j = j + 1

57

}

55

}

58

mat

56

mat

59

}

57

}

60

58

61

#2#Function for reorganizing information within the columns

59

#2#Function for reorganizing information within the columns

62

cinfo <- function(mat){

60

cinfo <- function(mat){

63

col <- dim(mat)[2]

61

col <- dim(mat)[2]

64

j <-2

62

j <-2

65

for(j in 2:col){

63

for(j in 2:col){

66

if(grepl("Group",colnames(mat)[j]) == TRUE){

64

if(grepl("Group",colnames(mat)[j]) == TRUE){

67

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

65

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

68

}

66

}

69

if(grepl("Age",colnames(mat)[j])==TRUE){

67

if(grepl("Age",colnames(mat)[j])==TRUE){

70

mat[,j] <- gsub("\\D","",mat[,j])%>%

68

mat[,j] <- gsub("\\D","",mat[,j])%>%

71

as.integer()

69

as.integer()

72

}

70

}

73

if(grepl("Sex",colnames(mat)[j])==TRUE){

71

if(grepl("Sex",colnames(mat)[j])==TRUE){

74

mat[,j] <- gsub(".+:\\s","",mat[,j])

72

mat[,j] <- gsub(".+:\\s","",mat[,j])

75

}

73

}

76

if(grepl("PMI",colnames(mat)[j])==TRUE){

74

if(grepl("PMI",colnames(mat)[j])==TRUE){

77

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

75

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

78

as.numeric()

76

as.numeric()

79

}

77

}

80

if(grepl("Braak",colnames(mat)[j])==TRUE){

78

if(grepl("Braak",colnames(mat)[j])==TRUE){

81

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

79

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

82

as.roman()%>%

80

as.roman()%>%

83

as.integer()

81

as.integer()

84

}

82

}

85

j=j+1

83

j=j+1

86

}

84

}

87

mat

85

mat

88

}

86

}

89

87

90

#3#Function for labeling the gene IDs without names

88

#3#Function for labeling the gene IDs without names

91

NAFIXING <- function(GIDNAM){

89

NAFIXING <- function(GIDNAM){

92

row <- dim(GIDNAM)[1]

90

row <- dim(GIDNAM)[1]

93

i <- 1

91

i <- 1

94

for(i in 1:row){

92

for(i in 1:row){

95

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

93

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

96

GIDNAM[i,2] <- GIDNAM[i,1]

94

GIDNAM[i,2] <- GIDNAM[i,1]

97

}

95

}

98

i <- i + 1

96

i <- i + 1

99

}

97

}

100

GIDNAM

98

GIDNAM

101

}

99

}

102

100

103

#4#Function for changing the gene ID to gene name

101

#4#Function for changing the gene ID to gene name

104

cgeneID <- function(GeneName,DATA){

102

cgeneID <- function(GeneName,DATA){

105

colGene <- dim(GeneName)[2]

103

nj <- t(GeneName)

106

j <- 1

104

nq <- t(DATA)

107

for(j in 1:colGene){

105

colGene <- dim(nj)[2]

108

chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

106

colDATA <- dim(nq)[2]

109

if(is.na(sum(chngsreq))==FALSE){

107

j <- 1

110

if(sum(chngsreq) > 0){

108

for(j in 1:colDATA){

111

DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

109

#where is that gene id located within the GPL file

110

chngreq <- grep(paste0("^",nq[1,j],"$"),nj[1,])

111

if(is.na(sum(chngreq))==FALSE){

112

if(sum(chngreq) > 0){

113

nq[1,j] <- gsub(paste0("^",nq[1,j],"$"),nj[2,chngreq],nq[1,j])

114

}

112

}

115

}

116

j <- j + 1

113

}

117

}

114

#if(sum(chngsreq) > 0){

118

nq

115

##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

116

#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

117

#}

118

j = j+1

119

}

120

DATA

121

}

119

}

120

#cgeneID <- function(GeneName,DATA){

121

# colGene <- dim(GeneName)[2]

122

# j <- 1

123

# for(j in 1:colGene){

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

125

# if(is.na(sum(chngsreq))==FALSE){

126

# if(sum(chngsreq) > 0){

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

128

# }

129

# }

130

# #if(sum(chngsreq) > 0){

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

133

# #}

134

# j = j+1

135

# }

136

# DATA

137

#}

122

138

123

#5#Function for adjusting the gene names

139

#5#Function for adjusting the gene names

124

gcnames <- function(DiData,usecol=1){

140

gcnames <- function(DiData,usecol=1){

125

nuruns <- dim(DiData)[2]

141

nuruns <- dim(DiData)[2]

126

i = 1

142

i = 1

127

nwnam <- rep("0",length.out=nuruns)

143

nwnam <- rep("0",length.out=nuruns)

128

for(i in 1:nuruns){

144

for(i in 1:nuruns){

129

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

145

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

130

nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])

146

nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][usecol])

131

} else{

147

} else{

132

nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])

148

nwnam[i]=str_trim(strsplit(colnames(DiData)[i],"///")[[1]][1])

133

}

149

}

134

150

135

}

151

}

136

nwnam

152

nwnam

137

153

138

}

154

}

139

155

140

#6# Function for discretizing the data

156

#6# Function for discretizing the data

141

dndat <- function(NDATA){

157

dndat <- function(NDATA){

142

rownd <- dim(NDATA)[1]

158

rownd <- dim(NDATA)[1]

143

colnd <- dim(NDATA)[2]

159

colnd <- dim(NDATA)[2]

144

DDATA <- matrix(0,nrow=rownd,ncol=colnd)

160

DDATA <- matrix(0,nrow=rownd,ncol=colnd)

145

colnames(DDATA) <- colnames(NDATA)

161

colnames(DDATA) <- colnames(NDATA)

146

i <- 1

162

i <- 1

147

for(i in 1:rownd){

163

for(i in 1:rownd){

148

j <- 1

164

j <- 1

149

for(j in 1:colnd){

165

for(j in 1:colnd){

150

if(is.na(NDATA[i,j])==FALSE){

166

if(is.na(NDATA[i,j])==FALSE){

151

167

152

if(NDATA[i,j] < -1){

168

if(NDATA[i,j] < -1){

153

DDATA[i,j]=0L

169

DDATA[i,j]=0L

154

}

170

} else if(NDATA[i,j] > 1){

155

if(NDATA[i,j] > 1){

156

DDATA[i,j]=2L

171

DDATA[i,j]=2L

157

}

172

} else if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){

158

if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){

159

DDATA[i,j]=1L

173

DDATA[i,j]=1L

160

}

174

}

161

} else{

175

} else{

162

DDATA[i,j] = NDATA[i,j]

176

DDATA[i,j] = NDATA[i,j]

163

}

177

}

164

j = j + 1

178

j = j + 1

165

}

179

}

166

i = i + 1

180

i = i + 1

167

}

181

}

168

DDATA

182

DDATA

169

}

183

}

170

184

171

185

172

#The Rest of this code will be used every time you want to change a data set

186

#The Rest of this code will be used every time you want to change a data set

173

187

174

#Getting the series matrix file

188

#Getting the series matrix file

175

print("Choose the series matrix file that you want to Analyze")

189

print("Choose the series matrix file that you want to Analyze")

176

alz <- file.choose()

190

alz <- file.choose()

177

191

178

#Getting the GPL file

192

#Getting the GPL file

179

print("Choose the GPL file that correlates with the above series matrix file")

193

print("Choose the GPL file that correlates with the above series matrix file")

180

genena <- file.choose()

194

genena <- file.choose()

181

195

182

196

183

#Find out if it is a soft GPL file or not

197

#Find out if it is a soft GPL file or not

184

soft <- strsplit(genena,"[\\|/]") %>%

198

soft <- strsplit(genena,"[\\|/]") %>%

185

.[[1]] %>%

199

.[[1]] %>%

186

.[length(.)] %>%

200

.[length(.)] %>%

187

grepl("soft|annot",.)

201

grepl("soft|annot",.)

188

202

189

#Working with the wordy part of the document

203

#Working with the wordy part of the document

190

alzword <- alz %>%

204

alzword <- alz %>%

191

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

205

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

192

filter(grepl("!Sample",X1))%>%

206

filter(grepl("!Sample",X1))%>%

193

filter(!grepl("!Sample_contact",X1))

207

filter(!grepl("!Sample_contact",X1))

194

208

195

##Changing row names and column names:

209

##Changing row names and column names:

196

ALZWORD <- t(alzword)

210

ALZWORD <- t(alzword)

197

rownames(ALZWORD)=NULL

211

rownames(ALZWORD)=NULL

198

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

212

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

199

ALZWORD <- chngrownm(ALZWORD)[-1,]

213

ALZWORD <- chngrownm(ALZWORD)[-1,]

200

ALZWORD <- ALZWORD%>%

214

ALZWORD <- ALZWORD%>%

201

as.data.frame()%>%

215

as.data.frame()%>%

202

dplyr::select(-starts_with("col"))

216

dplyr::select(-starts_with("col"))

203

217

204

##Reorganizing information within the columns

218

##Reorganizing information within the columns

205

ALZWORDF <- cinfo(ALZWORD)

219

ALZWORDF <- cinfo(ALZWORD)

206

220

207

221

208

#Working with Actual Data part of file

222

#Working with Actual Data part of file

209

alzdat <- alz %>%

223

alzdat <- alz %>%

210

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

224

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

211

ALZDAT <- t(alzdat[,-1])

225

ALZDAT <- t(alzdat[,-1])

212

rownames(ALZDAT)=NULL

226

rownames(ALZDAT)=NULL

213

227

214

##Is there a clean version of the GPL file available?

228

##Is there a clean version of the GPL file available?

215

gplnum <- strsplit(genena,"[\\|/]") %>%

229

gplnum <- strsplit(genena,"[\\|/]") %>%

216

.[[1]] %>%

230

.[[1]] %>%

217

.[length(.)] %>%

231

.[length(.)] %>%

218

gsub("\\D","",.)

232

gsub("\\D","",.)

219

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

233

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

220

if(clfileex >= 1){

234

if(clfileex >= 1){

221

#use the clean version

235

#use the clean version

222

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

236

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

223

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

237

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

224

238

225

}

239

} else if(clfileex == 0){

226

if(clfileex == 0){

227

##Lets Create a clean version

240

##Lets Create a clean version

228

241

229

##Gene ID to Gene Name

242

##Gene ID to Gene Name

230

if(soft == TRUE){

243

if(soft == TRUE){

231

#Check to see if there is already a file containing information on soft files

244

#Check to see if there is already a file containing information on soft files

232

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

245

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

233

if(fileex == 1){

246

if(fileex == 1){

234

#Check to see if this GPL soft file has been used before

247

#Check to see if this GPL soft file has been used before

235

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

248

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

236

.$GPL_FILE_NUM%>%

249

.$GPL_FILE_NUM%>%

237

grepl(gplnum,.) %>%

250

grepl(gplnum,.) %>%

238

sum()

251

sum()

239

if(IDF == 1){

252

if(IDF == 1){

240

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

253

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

241

.$GPL_FILE_NUM%>%

254

.$GPL_FILE_NUM%>%

242

grep(gplnum,.)

255

grep(gplnum,.)

243

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

256

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

244

.$LOC_ID %>%

257

.$LOC_ID %>%

245

.[IDLOCAL]

258

.[IDLOCAL]

246

geneIDNam <- genena %>%

259

geneIDNam <- genena %>%

247

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

260

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

248

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

261

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

249

}

262

} else if(IDF == 0){

250

if(IDF == 0){

251

#No information on this particular GPL file

263

#No information on this particular GPL file

252

idLOCGPL <- genena %>%

264

idLOCGPL <- genena %>%

253

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

265

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

254

t(.) %>%

266

t(.) %>%

255

grep("^ID\\s*$",.) %>%

267

grep("^ID\\s*$",.) %>%

256

-1

268

-1

257

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

269

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

258

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

270

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

259

geneIDNam <- genena %>%

271

geneIDNam <- genena %>%

260

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

272

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

261

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

273

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

262

}

274

}

263

}

275

} else if(fileex == 0){

264

if(fileex == 0){

265

#We must create a file that we can access for later use

276

#We must create a file that we can access for later use

266

idLOCGPL <- genena %>%

277

idLOCGPL <- genena %>%

267

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

278

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

268

t(.) %>%

279

t(.) %>%

269

grep("^ID\\s*$",.) %>%

280

grep("^ID\\s*$",.) %>%

270

-1

281

-1

271

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

282

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

272

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

283

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

273

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

284

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

274

geneIDNam <- genena %>%

285

geneIDNam <- genena %>%

275

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

286

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

276

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

287

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

277

}

288

}

278

}

289

} else if(soft == FALSE){

279

if(soft == FALSE){

280

geneIDNam <- genena %>%

290

geneIDNam <- genena %>%

281

read_delim(delim="\t",comment = "#")%>%

291

read_delim(delim="\t",comment = "#")%>%

282

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

292

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))

283

}

293

}

284

294

285

##Labeling the gene IDs without names

295

##Labeling the gene IDs without names

286

geneIDNam <- NAFIXING(geneIDNam)

296

geneIDNam <- NAFIXING(geneIDNam)

287

297

288

##remove the whitespace

298

##remove the whitespace

289

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

299

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

290

300

291

##Here is the clean version

301

##Here is the clean version

292

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

302

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

293

}

303

}

294

304

295

305

296

306

297

##Changing the gene ID to gene name

307

##Changing the gene ID to gene name

298

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

308

ALZDAT1 <- cgeneID(geneIDNam,alzdat)

299

colnames(ALZDAT) = ALZDAT1[1,]

309

colnames(ALZDAT) = ALZDAT1[1,]

300

310

301

311

302

##Adjusting the column names aka the gene names

312

##Adjusting the column names aka the gene names

303

colnames(ALZDAT) <- gcnames(ALZDAT)

313

colnames(ALZDAT) <- gcnames(ALZDAT)

304

314

305

315

306

#Full RAW Data

316

#Full RAW Data

307

Fullalzdwr <- ALZDAT %>%

317

Fullalzdwr <- ALZDAT %>%

308

as.data.frame() %>%

318

as.data.frame() %>%

309

cbind(ALZWORDF,.)

319

cbind(ALZWORDF,.)

310

320

311

321

312

#Raw file is output

322

#Raw file is output

313

nfnaex <- strsplit(alz,"[\\]") %>%

323

nfnaex <- strsplit(alz,"[\\]") %>%

314

.[[1]] %>%

324

.[[1]] %>%

315

.[length(.)] %>%

325

.[length(.)] %>%

316

gsub("\\D","",.) %>%

326

gsub("\\D","",.) %>%

317

c("GSE",.,"aftexcel.txt") %>%

327

c("GSE",.,"aftexcel.txt") %>%

318

paste(collapse = "")

328

paste(collapse = "")

319

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

329

write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

320

330

321

331

322

#Now for the discretization part

332

#Now for the discretization part

323

##get the wordy part again

333

##get the wordy part again

324

rawword <- t(ALZWORDF)

334

rawword <- t(ALZWORDF)

325

335

326

##where is ID_REF located

336

##where is ID_REF located

327

hereim <- grep("ID_REF",rownames(rawword))

337

hereim <- grep("ID_REF",rownames(rawword))

328

338

329

##Subject Names GSM...

339

##Subject Names GSM...

330

subjnam <- rawword[hereim,]

340

subjnam <- rawword[hereim,]

331

341

332

##Getting the names for the rows

342

##Getting the names for the rows

333

namedarows <- rownames(rawword)[-hereim] %>%

343

namedarows <- rownames(rawword)[-hereim] %>%

334

as.data.frame()

344

as.data.frame()

335

RAWWORD <- rawword[-hereim,] %>%

345

RAWWORD <- rawword[-hereim,] %>%

336

as.data.frame() %>%

346

as.data.frame() %>%

337

bind_cols(namedarows,.)

347

bind_cols(namedarows,.)

338

z <- 1

348

z <- 1

339

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

349

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

340

for(z in 1:dim(RAWWORD)[1]){

350

for(z in 1:dim(RAWWORD)[1]){

341

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

351

if(sum(is.na(RAWWORD[z,])) > 0){

342

z <- z + 1

352

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

353

}

354

if(length(grep("NA",RAWWORD[z,])) > 0){

355

naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]

356

}

357

z <- z + 1

343

}

358

}

344

359

345

colnames(naroww) <- "ROW_NAs"

360

colnames(naroww) <- "ROW_NAs"

346

RAWWORD <- bind_cols(RAWWORD,naroww)

361

RAWWORD <- bind_cols(RAWWORD,naroww)

347

362

348

363

349

roALZna <- t(ALZDAT) %>%

364

roALZna <- t(ALZDAT) %>%

350

rownames(.) %>%

365

rownames(.) %>%

351

as.data.frame(.)

366

as.data.frame(.)

352

colnames(roALZna) <- "ID_REF"

367

colnames(roALZna) <- "ID_REF"

353

368

354

RAWDAT <- t(ALZDAT) %>%

369

RAWDAT <- t(ALZDAT) %>%

355

as.data.frame(.)

370

as.data.frame(.)

356

colnames(RAWDAT) <- NULL

371

colnames(RAWDAT) <- NULL

357

rownames(RAWDAT) <- NULL

372

rownames(RAWDAT) <- NULL

358

373

359

RAWDAT2 <- RAWDAT %>%

374

RAWDAT2 <- RAWDAT %>%

360

cbind(roALZna,.) %>%

375

cbind(roALZna,.) %>%

361

dplyr::arrange(.,ID_REF)

376

dplyr::arrange(.,ID_REF)

362

377

363

##Editing the file for R processing

378

##Editing the file for R processing

364

RAWDATID <- RAWDAT2[,1] %>%

379

RAWDATID <- RAWDAT2[,1] %>%

365

as.matrix(.)

380

as.matrix(.)

366

381

367

RAWDATNUM <- RAWDAT2[,-1] %>%

382

RAWDATNUM <- RAWDAT2[,-1] %>%

368

mapply(.,FUN = as.numeric) %>%

383

mapply(.,FUN = as.numeric) %>%

369

t(.)

384

t(.)

370

385

371

##Consolidating genes with the same name

386

##Consolidating genes with the same name

372

###create empty matrix of size equal to tabRDATID

387

###create empty matrix of size equal to tabRDATID

373

tabRDATID <- table(RAWDATID)

388

tabRDATID <- table(RAWDATID)

374

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

389

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

375

j <- 1

390

j <- 1

376

for(j in 1:length(tabRDATID)){

391

for(j in 1:length(tabRDATID)){

377

392

378

##Putting the ones without duplicates in their new homes

393

##Putting the ones without duplicates in their new homes

379

if(tabRDATID[j] == 1){

394

if(tabRDATID[j] == 1){

380

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

395

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

381

}

396

} else if(tabRDATID[j] > 1){

382

##Averaging duplicates and putting them in their new homes

397

##Averaging duplicates and putting them in their new homes

383

if(tabRDATID[j] > 1){

384

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

398

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

385

}

399

}

386

j <- j + 1

400

j <- j + 1

387

}

401

}

388

402

389

##Scaling the Data

403

##Scaling the Data

390

scrawdat <- NuRDATN%>%

404

scrawdat <- NuRDATN%>%

391

scale()

405

scale()

392

attr(scrawdat,"scaled:center") <- NULL

406

attr(scrawdat,"scaled:center") <- NULL

393

attr(scrawdat,"scaled:scale") <- NULL

407

attr(scrawdat,"scaled:scale") <- NULL

394

colnames(scrawdat) <- rownames(tabRDATID)

408

colnames(scrawdat) <- rownames(tabRDATID)

395

409

396

##Discretized the Data

410

##Discretized the Data

397

dialzdat <- scrawdat %>%

411

dialzdat <- scrawdat %>%

398

dndat(.) %>%

412

dndat(.) %>%

399

t()%>%

413

t()%>%

400

as.data.frame(.)

414

as.data.frame(.)

401

colnames(dialzdat) <- rownames(RAWDATNUM)

415

colnames(dialzdat) <- rownames(RAWDATNUM)

402

416

403

##setting "ID_REF" as a new variable

417

##setting "ID_REF" as a new variable

404

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

418

geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

405

colnames(geneNAM) <- "ID_REF"

419

colnames(geneNAM) <- "ID_REF"

406

rownames(dialzdat) <- NULL

420

rownames(dialzdat) <- NULL

407

dialzdat <-bind_cols(geneNAM,dialzdat)

421

dialzdat <-bind_cols(geneNAM,dialzdat)

408

422

409

##NAs in a column

423

##NAs in a column

410

x <- 2

424

x <- 2

411

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

425

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

412

nacol[1,1] = "COL_NAs"

426

nacol[1,1] = "COL_NAs"

413

for(x in 2:dim(dialzdat)[2]){

427

for(x in 2:dim(dialzdat)[2]){

414

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

428

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

415

x <- x + 1

429

x <- x + 1

416

}

430

}

417

colnames(nacol) <- colnames(dialzdat)

431

colnames(nacol) <- colnames(dialzdat)

418

dialzdat<-bind_rows(dialzdat,nacol)

432

dialzdat<-bind_rows(dialzdat,nacol)

419

433

420

##NAs in a row

434

##NAs in a row

421

y <- 1

435

y <- 1

422

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

436

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

423

for(y in 1:dim(dialzdat)[1]){

437

for(y in 1:dim(dialzdat)[1]){

424

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

438

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

425

y <- y + 1

439

y <- y + 1

426

}

440

}

427

colnames(narowd) <- "ROW_NAs"

441

colnames(narowd) <- "ROW_NAs"

428

dialzdat <- bind_cols(dialzdat,narowd)

442

dialzdat <- bind_cols(dialzdat,narowd)

429

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

443

colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam

430

colnames(RAWWORD) <- colnames(dialzdat)

444

colnames(RAWWORD) <- colnames(dialzdat)

431

##converting to character so that the clinical can be brought together with discrete data

445

##converting to character so that the clinical can be brought together with discrete data

432

k <- 2

446

k <- 2

433

for(k in 2:dim(dialzdat)[2]-1){

447

for(k in 2:dim(dialzdat)[2]-1){

434

dialzdat[,k] <- as.character(dialzdat[,k])

448

dialzdat[,k] <- as.character(dialzdat[,k])

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Most recent update fixed a few handling errors

 ##Posted 6/15/2017
 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 # Rename the columns of a transposed "!Sample_*" metadata matrix.
 #
 # mat: character matrix (or data frame) whose first row holds the GEO field
 #      name of each column (e.g. "!Sample_title") and whose second row holds
 #      the first sample's value for that field.
 #
 # Returns mat with only its column names changed:
 #   * the three fixed GEO fields become "Brain_Region", "Title" and "ID_REF";
 #   * any other column is named from the content of its second-row value
 #     ("Sex<n>", "PMI<n>", "Age<n>", "Braak<n>", "Group<n>"), numbered in
 #     order of appearance;
 #   * columns matching nothing keep their existing name.
 chngrownm <- function(mat){
   fixed_names <- c(
     "!Sample_source_name_ch1" = "Brain_Region",
     "!Sample_title" = "Title",
     "!Sample_geo_accession" = "ID_REF"
   )
   # Checked in this order; when a value matches several patterns the LAST
   # match decides the name.
   # NOTE(review): every match still advances its own counter, so a value such
   # as "braak stage: III" (which also matches "age") ends up as "Braak<n>" but
   # leaves a gap in the Age numbering. Kept as-is to preserve existing names.
   value_patterns <- c(
     Sex = "Sex|gender|Gender|sex",
     PMI = "postmorteminterval|PMI|pmi",
     Age = "age|Age|AGE",
     Braak = "braak|b&b",
     Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
   )
   next_index <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

   # Positional renaming needs a full set of names to overwrite.
   if (is.null(colnames(mat))) {
     colnames(mat) <- colnames(mat, do.NULL = FALSE)
   }

   # seq_len() keeps the loop from running when there are no columns.
   for (j in seq_len(dim(mat)[2])) {
     field <- as.character(mat[1, j])
     # A missing header cell is treated as "not a fixed field", not an error.
     if (!is.na(field) && field %in% names(fixed_names)) {
       colnames(mat)[j] <- fixed_names[[field]]
     } else {
       for (label in names(value_patterns)) {
         if (grepl(value_patterns[[label]], mat[2, j])) {
           colnames(mat)[j] <- paste0(label, next_index[[label]])
           next_index[[label]] <- next_index[[label]] + 1
         }
       }
     }
   }
   mat
 }
 #2#Function for reorganizing information within the columns
 # Strip the "label: " prefixes from the clinical columns and convert the
 # numeric ones.
 #
 # mat: data frame (or matrix) whose column names were assigned by the
 #      renaming step ("Group<n>", "Age<n>", "Sex<n>", "PMI<n>", "Braak<n>").
 #      Column 1 is never modified.
 #
 # Returns mat with, for every column after the first whose name contains:
 #   "Group" - text up to ": " removed, plus any " ...;..." tail;
 #   "Age"   - all non-digit characters removed, then as.integer();
 #   "Sex"   - text up to ": " removed;
 #   "PMI"   - everything except digits and "." removed, then as.numeric();
 #   "Braak" - text up to ": " removed, read as a roman numeral, as.integer().
 cinfo <- function(mat){
   n_col <- dim(mat)[2]
   # 2:n_col would count DOWN (2, 1) for a single-column input and rewrite the
   # first column, so stop early when there is nothing after column 1.
   if (is.null(n_col) || n_col < 2) {
     return(mat)
   }
   for (j in 2:n_col) {
     col_name <- colnames(mat)[j]
     if (grepl("Group", col_name)) {
       mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
     }
     if (grepl("Age", col_name)) {
       # NOTE(review): "\\D" also removes a decimal point, so "85.5" would be
       # read as 855 - confirm ages are always whole numbers.
       mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
     }
     if (grepl("Sex", col_name)) {
       mat[, j] <- gsub(".+:\\s", "", mat[, j])
     }
     if (grepl("PMI", col_name)) {
       mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
     }
     if (grepl("Braak", col_name)) {
       mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
     }
   }
   mat
 }
 #3#Function for labeling the gene IDs without names
 # Give every probe without a gene symbol its own ID as the symbol.
 #
 # GIDNAM: two-column table (matrix, data frame or tibble); column 1 is the
 #         probe ID and column 2 the gene symbol.
 #
 # Returns GIDNAM where each row whose second column is NA, or is the text
 # "NA" optionally followed by whitespace, has that cell replaced by the
 # row's first-column value. A zero-row input is returned unchanged.
 NAFIXING <- function(GIDNAM){
   # 1:row would iterate over c(1, 0) on an empty table, so handle it up front.
   if (dim(GIDNAM)[1] == 0) {
     return(GIDNAM)
   }
   symbols <- GIDNAM[, 2, drop = TRUE]
   unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
   if (any(unnamed)) {
     fallback <- GIDNAM[unnamed, 1, drop = TRUE]
     # Keep the symbol column's type when the IDs were parsed as numbers.
     if (is.character(symbols)) {
       fallback <- as.character(fallback)
     }
     GIDNAM[unnamed, 2] <- fallback
   }
   GIDNAM
 }
 #4#Function for changing the gene ID to gene name
 # Replace probe IDs with gene names.
 #
 # GeneName: two-column table; column 1 is the probe ID, column 2 the gene
 #           name to use for it.
 # DATA:     table whose first column holds the probe IDs to translate.
 #
 # Returns t(DATA) in which every entry of the first row that equals an ID in
 # GeneName has been replaced by the matching gene name. IDs that are missing
 # or not listed in GeneName are left as they are. When an ID is listed more
 # than once, the first listing wins.
 #
 # IDs are compared by exact equality (match()), not as regular expressions:
 # an ID containing "." or other regex metacharacters can therefore no longer
 # pick up a different probe's name, and the lookup is a single pass instead
 # of one grep() over the whole GPL table per probe.
 cgeneID <- function(GeneName,DATA){
   nj <- t(GeneName)
   nq <- t(DATA)
   ids <- nq[1, ]
   hit <- match(ids, nj[1, ])
   found <- !is.na(hit) & !is.na(ids)
   nq[1, found] <- nj[2, hit[found]]
   nq
 }
 #5#Function for adjusting the gene names
 # Pick one alias out of each "A /// B /// C" style column name.
 #
 # DiData: matrix or data frame whose column names may list several gene
 #         names separated by "///".
 # usecol: position of the alias to keep (default 1). Names with fewer than
 #         usecol aliases fall back to their first alias.
 #
 # Returns a character vector, one whitespace-trimmed name per column.
 # An input without columns yields character(0).
 gcnames <- function(DiData,usecol=1){
   # 1:ncol would iterate over c(1, 0) here, so handle "no columns" up front.
   if (dim(DiData)[2] == 0) {
     return(character(0))
   }
   # Split every name once instead of twice per column.
   pieces <- strsplit(colnames(DiData), "///")
   vapply(pieces, function(alias) {
     pick <- if (length(alias) >= usecol) usecol else 1
     trimws(alias[pick])
   }, character(1), USE.NAMES = FALSE)
 }
 #6# Function for discretizing the data
 # Discretize scaled values into three levels.
 #
 # NDATA: numeric matrix (or data frame) of scaled values.
 #
 # Returns a numeric matrix of the same dimensions and column names where
 #   value < -1        -> 0
 #   -1 <= value <= 1  -> 1
 #   value > 1         -> 2
 #   missing value     -> kept as the original missing value.
 #
 # A value of exactly 1 is placed in the middle level; previously it matched
 # no branch and silently kept the initial 0. Inputs with no rows or no
 # columns give an empty matrix instead of an error.
 dndat <- function(NDATA){
   values <- as.matrix(NDATA)
   # Start everything in the middle level, then move the tails out.
   DDATA <- matrix(1, nrow = dim(values)[1], ncol = dim(values)[2])
   colnames(DDATA) <- colnames(NDATA)
   # which() drops the NA comparisons so missing cells are not touched here.
   DDATA[which(values < -1)] <- 0
   DDATA[which(values > 1)] <- 2
   missing <- is.na(values)
   DDATA[missing] <- values[missing]
   DDATA
 }
 #Everything from here on is run again for each data set that is converted
 #Ask for the series matrix file first
 print("Choose the series matrix file that you want to Analyze")
 alz <- file.choose()
 #Then ask for the GPL file that goes with it
 print("Choose the GPL file that correlates with the above series matrix file")
 genena <- file.choose()
 #soft is TRUE when the last path component of the GPL file contains
 #"soft" or "annot"
 soft <- local({
 	gplparts <- strsplit(genena,"[\\|/]")[[1]]
 	grepl("soft|annot",gplparts[length(gplparts)])
 })
 #Working with the wordy part of the document
 #Read the file without a header, with "!Series" as the comment marker, and
 #keep the rows whose first field contains "!Sample" but not "!Sample_contact"
 alzword <- read_delim(alz,delim ="\t",comment = "!Series",col_names = FALSE)
 alzword <- filter(alzword,grepl("!Sample",X1),!grepl("!Sample_contact",X1))
 ##Changing row names and column names:
 ##transpose, discard the row names and give every column a default "col..."
 ##name for chngrownm() to work on
 ALZWORD <- t(alzword)
 rownames(ALZWORD) <- NULL
 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 ##Drop the first row, then every column whose name still starts with "col"
 ALZWORD <- chngrownm(ALZWORD)[-1,] %>%
 	as.data.frame()
 ALZWORD <- dplyr::select(ALZWORD,-starts_with("col"))
 ##Reorganizing information within the columns
 ALZWORDF <- cinfo(ALZWORD)
 #Working with Actual Data part of file
 #Skip one line, treat "!" as the comment marker and use the first remaining
 #line as the header
 alzdat <- read_delim(alz,delim="\t",col_names=TRUE,comment = "!",skip=1)
 #Transpose everything but the first column and discard the row names
 ALZDAT <- t(alzdat[,-1])
 rownames(ALZDAT) <- NULL
 ##Is there a clean version of the GPL file available?
 ##gplnum is what remains of the last path component of the GPL file after
 ##every non-digit character has been removed
 gplnum <- strsplit(genena,"[\\|/]") %>%
 	.[[1]] %>%
 	.[length(.)] %>%
 	gsub("\\D","",.)
 ##clfileex counts the entries of the working directory whose name contains
 ##"Clean_GPL" followed by gplnum
 ##NOTE(review): this is an unanchored match, so a longer number that starts
 ##with gplnum is counted as well, while the file read below is always
 ##Clean_GPL<gplnum>.txt - confirm the two cannot disagree
 clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))
 if(clfileex >= 1){
 #use the clean version
 #(two tab-separated columns, read as ID and Symbol)
 geneIDNam <-  paste0("Clean_GPL",gplnum,".txt") %>%
 	read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")
-}
+} else if(clfileex == 0){
-if(clfileex == 0){
 ##Lets Create a clean version
 ##Gene ID to Gene Name
 	if(soft == TRUE){
 		#Check to see if there is already a file containing information on soft files
 		#GPL_ID_LOC.txt has the columns GPL_FILE_NUM and LOC_ID; LOC_ID is used
 		#below as the skip argument when the matching GPL file is read
 		fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))
 		if(fileex == 1){
 			#Check to see if this GPL soft file has been used before
 			#IDF = number of GPL_FILE_NUM entries that contain gplnum
 			#NOTE(review): neither branch below runs when IDF is greater than 1,
 			#so geneIDNam is not assigned on this path in that case
 			IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
 				.$GPL_FILE_NUM%>%
 				grepl(gplnum,.) %>%
 				sum()
 			if(IDF == 1){
 				#Position of the matching entry, then its LOC_ID value
 				IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
 					.$GPL_FILE_NUM%>%
 					grep(gplnum,.)
 				idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%
 					.$LOC_ID %>%
 					.[IDLOCAL]
 				#Keep ID plus the columns whose name matches Symbol, ORF or
 				#gene_assignment
 				geneIDNam <- genena %>%
 					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%
 					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
-			}
+			} else if(IDF == 0){
-			if(IDF == 0){
 				#No information on this particular GPL file
 				#Read at most 1000 lines without a header and look for the cell
 				#that holds "ID" (optionally followed by whitespace)
 				#NOTE(review): the trailing "%>% -1" presumably turns into
 				#"match position minus 1" through the pipe - verify
 				idLOCGPL <- genena %>%
 					read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
 					t(.) %>%
 					grep("^ID\\s*$",.) %>%
 					-1
 				#Append the GPL number and the value just found to GPL_ID_LOC.txt
 				cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
 					cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
 				geneIDNam <- genena %>%
 					read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 					dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 			}
-		}
+		} else if(fileex == 0){
-		if(fileex == 0){
 			#We must create a file that we can access for later use
 			#Same search for the "ID" cell as in the branch above
 			idLOCGPL <- genena %>%
 				read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
 				t(.) %>%
 				grep("^ID\\s*$",.) %>%
 				-1
 			#Start GPL_ID_LOC.txt with a header line and this first entry
 			Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
 			colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
 			write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
 			geneIDNam <- genena %>%
 				read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
 				dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 		}
-	 }
+	 } else if(soft == FALSE){
-	if(soft == FALSE){
 		#Not a soft/annot file: read it with "#" as the comment marker and
 		#keep the same columns as above
 		geneIDNam <- genena %>%
 		read_delim(delim="\t",comment = "#")%>%
 		dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 	}
 	##Labeling the gene IDs without names
 	##(NAFIXING is defined elsewhere in this file)
 	geneIDNam <- NAFIXING(geneIDNam)
 	##remove the whitespace
 	##(only the first two columns are trimmed and kept)
 	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
 	##Here is the clean version
 	##written without row or column names, under the file name that the
 	##"use the clean version" branch above reads
 	write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
 }
 ##Changing the gene ID to gene name
 ##cgeneID() is defined earlier in this file; the first row of what it returns
 ##becomes the column names of ALZDAT
-ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
+ALZDAT1 <- cgeneID(geneIDNam,alzdat)
 colnames(ALZDAT) = ALZDAT1[1,]
 ##Adjusting the column names aka the gene names
 ##gcnames() keeps one "///"-separated piece of each name (the first one by
 ##default)
 colnames(ALZDAT) <- gcnames(ALZDAT)
 #Full RAW Data
 #The wordy columns (ALZWORDF) followed by the data columns (ALZDAT)
 Fullalzdwr <- ALZDAT %>%
 	as.data.frame() %>%
 	cbind(ALZWORDF,.)
 #Raw file is output
 #The output name is "GSE" + the digits of the series matrix file name +
 #"aftexcel.txt". The path is split with the same pattern that is used for
 #the GPL file, so "/"-separated paths are reduced to the file name too and
 #digits in directory names no longer end up in the output name.
 nfnaex <- strsplit(alz,"[\\|/]") %>%
 	.[[1]] %>%
 	.[length(.)] %>%
 	gsub("\\D","",.) %>%
 	c("GSE",.,"aftexcel.txt") %>%
 	paste(collapse = "")
 #The table is written transposed, tab separated
 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
 #Now for the discretization part
 ##get the wordy part again
 rawword <- t(ALZWORDF)
 ##where is ID_REF located
 hereim <- grep("ID_REF",rownames(rawword))
 ##Subject Names GSM...
 subjnam <- rawword[hereim,]
 ##Getting the names for the rows
 namedarows <- rownames(rawword)[-hereim] %>%
 	as.data.frame()
 ##RAWWORD = the row names as a first column, followed by every row of
 ##rawword except the ID_REF one
 RAWWORD <- rawword[-hereim,] %>%
 	as.data.frame() %>%
 	bind_cols(namedarows,.)
 ##naroww holds one count per row of RAWWORD, starting at 0
 z <- 1
 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 ##The removed (-) lines stored the number of NA cells of the row; the added
 ##(+) lines store that number only when it is positive and then add the
 ##number of cells matched by grep("NA", ...)
 ##NOTE(review): the pattern "NA" is unanchored, so a cell that merely
 ##contains those two letters is counted as well - confirm this is intended
 ##The "z <- z + 1" line does not change which rows are visited, because
 ##for() assigns z again at the start of every iteration
 for(z in 1:dim(RAWWORD)[1]){
-	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
+    if(sum(is.na(RAWWORD[z,])) > 0){
-	z <- z + 1
+        naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
+    }
+	if(length(grep("NA",RAWWORD[z,])) > 0){
+        naroww[z,1] <- as.integer(length(grep("NA",RAWWORD[z,]))) + naroww[z,1]
+    }
+    z <- z + 1
 }
 ##The counts become the last column of RAWWORD, named ROW_NAs
 colnames(naroww) <- "ROW_NAs"
 RAWWORD <- bind_cols(RAWWORD,naroww)
 #Row names of the transposed data as a one-column data frame called ID_REF
 roALZna <- as.data.frame(rownames(t(ALZDAT)))
 colnames(roALZna) <- "ID_REF"
 #The transposed data itself, with its row and column names removed
 RAWDAT <- as.data.frame(t(ALZDAT))
 colnames(RAWDAT) <- NULL
 rownames(RAWDAT) <- NULL
 #ID_REF goes in front of the data, then the rows are ordered by ID_REF
 RAWDAT2 <- RAWDAT %>%
 	cbind(roALZna,.)
 RAWDAT2 <- dplyr::arrange(RAWDAT2,ID_REF)
 ##Editing the file for R processing
 ##RAWDATID: the ID_REF column as a matrix
 RAWDATID <- as.matrix(RAWDAT2[,1])
 ##RAWDATNUM: the remaining columns run through as.numeric, transposed
 RAWDATNUM <- t(mapply(RAWDAT2[,-1],FUN = as.numeric))
 ##Consolidating genes with the same name
 ###tabRDATID counts how often each name occurs in RAWDATID; NuRDATN gets one
 ###column per distinct name and as many rows as RAWDATNUM
 tabRDATID <- table(RAWDATID)
 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
 ###seq_along() keeps the loop from running when the table is empty
 for(j in seq_along(tabRDATID)){
 	###Columns of RAWDATNUM that belong to the j-th name
 	samegene <- which(RAWDATID==rownames(tabRDATID)[j])
 	if(tabRDATID[j] == 1){
 		##Putting the ones without duplicates in their new homes
 		NuRDATN[,j] <- RAWDATNUM[,samegene]
 	} else if(tabRDATID[j] > 1){
 		##Averaging duplicates and putting them in their new homes
 		##drop = FALSE keeps the selection a matrix even when RAWDATNUM has a
 		##single row, which rowMeans() needs
 		NuRDATN[,j] <- rowMeans(RAWDATNUM[,samegene,drop = FALSE],na.rm = TRUE)
 	}
 }
 ##Scaling the Data
 ##scale() is applied to NuRDATN, the two attributes it attaches are removed
 ##and the columns are labelled with the names from tabRDATID
 scrawdat <- scale(NuRDATN)
 attr(scrawdat,"scaled:scale") <- NULL
 attr(scrawdat,"scaled:center") <- NULL
 colnames(scrawdat) <- rownames(tabRDATID)
 ##Discretized the Data
 ##dndat() result, transposed and turned into a data frame
 dialzdat <- as.data.frame(t(dndat(scrawdat)))
 colnames(dialzdat) <- rownames(RAWDATNUM)
 ##setting "ID_REF" as a new variable
 ##the row names become the first column and are then removed as row names
 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
 colnames(geneNAM) <- "ID_REF"
 rownames(dialzdat) <- NULL
 dialzdat <- bind_cols(geneNAM,dialzdat)
 ##NAs in a column
 ##nacol is a one-row data frame with one cell per column of dialzdat: the
 ##first cell holds the label "COL_NAs", every other cell the number of NA
 ##values in the corresponding column
 x <- 2
 nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
 nacol[1,1] = "COL_NAs"
 ##The "x <- x + 1" line does not change which columns are visited, because
 ##for() assigns x again at the start of every iteration
 for(x in 2:dim(dialzdat)[2]){
 	nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
 	x <- x + 1
 }
 ##The counts are appended to dialzdat as an extra row
 colnames(nacol) <- colnames(dialzdat)
 dialzdat<-bind_rows(dialzdat,nacol)
 ##NAs in a row
 ##narowd holds, for every row of dialzdat (the COL_NAs row included), the
 ##number of NA cells in that row
 y <- 1
 narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
 for(y in 1:dim(dialzdat)[1]){
 	narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
 	y <- y + 1
 }
 ##The counts are appended to dialzdat as a last column named ROW_NAs
 colnames(narowd) <- "ROW_NAs"
 dialzdat <- bind_cols(dialzdat,narowd)
 ##Every column between the first and the last one is named after subjnam,
 ##and RAWWORD is given the same column names as dialzdat
 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
 colnames(RAWWORD) <- colnames(dialzdat)
 ##converting to character so that the clinical can be brought together with discrete data
 k <- 2
 for(k in 2:dim(dialzdat)[2]-1){
 	dialzdat[,k] <- as.character(dialzdat[,k])