Efrain Gonzalez / Cleaning and Fixing Data with R

1

##Posted 6/15/2017

2

3

1

#Libraries required to run the code

4

#Libraries required to run the code

2

library(pryr)

5

library(pryr)

3

library(MASS)

6

library(MASS)

4

library(dplyr)

7

library(dplyr)

5

library(tidyr)

8

library(tidyr)

6

library(readr)

9

library(readr)

7

library(stringr)

10

library(stringr)

8

11

9

12

10

#Necessary Functions

13

#Necessary Functions

11

#1#Function for handling the changing of row names and column names
# Renames the columns of a sample-metadata matrix.
#   mat: matrix with one column per metadata field. Row 1 holds the GEO field
#        label (e.g. "!Sample_title"); row 2 holds the first sample's value,
#        which is scanned for keywords.
# Returns `mat` with updated column names; the cell values are not modified.
# Columns that match nothing keep their existing name.
chngrownm <- function(mat) {
  # Field labels that map to one fixed column name.
  fixed_labels <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Keyword patterns tested, in this order, against the first data row.
  # A later match overrides an earlier one, and every match advances its own
  # counter, so the numbering scheme of the original code is preserved.
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)
  has_values <- nrow(mat) >= 2

  for (j in seq_len(ncol(mat))) {
    label <- as.character(mat[1, j])
    # A recognised label settles the name; skipping the keyword scan keeps a
    # title such as "Control sample 1" from being renamed to "Group1".
    if (label %in% names(fixed_labels)) {
      colnames(mat)[j] <- fixed_labels[[label]]
      next
    }
    if (!has_values) {
      next
    }
    for (prefix in names(keyword_patterns)) {
      if (grepl(keyword_patterns[[prefix]], mat[2, j])) {
        colnames(mat)[j] <- paste0(prefix, counters[[prefix]])
        counters[[prefix]] <- counters[[prefix]] + 1
      }
    }
  }
  mat
}

#2#Function for reorganizing information within the columns
# Cleans the values of the metadata columns according to the column name.
#   mat: data frame (or matrix) whose column names carry the prefixes
#        "Group", "Age", "Sex", "PMI" and "Braak".
# Returns `mat` with the matching columns rewritten:
#   Group -> text after the "label: " prefix (and before a " ...;..." tail)
#   Age   -> all non-digits removed, converted to integer
#   Sex   -> text after the "label: " prefix
#   PMI   -> everything except digits and "." removed, converted to numeric
#   Braak -> text after the "label: " prefix read as a roman numeral, as integer
# The first column is never touched (the original loop also started at 2).
cinfo <- function(mat) {
  n_col <- ncol(mat)
  # With fewer than two columns `2:n_col` would count backwards.
  if (n_col < 2) {
    return(mat)
  }
  for (j in 2:n_col) {
    field <- colnames(mat)[j]
    if (grepl("Group", field)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", field)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", field)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", field)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", field)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

#3#Function for labeling the gene IDs without names
# Fills in missing gene names with the gene ID from the same row.
#   GIDNAM: data frame or matrix; column 1 holds the ID, column 2 the name.
# A name counts as missing when it is NA or the literal text "NA"
# (optionally followed by whitespace).
# Returns `GIDNAM` with column 2 completed; other columns are unchanged.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  if (is_df) {
    ids <- GIDNAM[[1]]
    symbols <- GIDNAM[[2]]
  } else {
    ids <- GIDNAM[, 1]
    symbols <- GIDNAM[, 2]
  }
  # Factors cannot take values outside their levels, so work on plain text.
  if (is.factor(ids)) {
    ids <- as.character(ids)
  }
  if (is.factor(symbols)) {
    symbols <- as.character(symbols)
  }

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}

#4#Function for changing the gene ID to gene name
# Replaces the IDs in the first row of DATA with their gene names.
#   GeneName: matrix with the IDs in row 1 and the matching names in row 2.
#   DATA:     matrix whose first row holds the IDs to translate.
# Returns DATA with row 1 translated; IDs without an entry in GeneName and
# all other rows are left unchanged.
# IDs are compared as exact strings. The earlier regex-based lookup treated
# characters such as "." as wildcards and could rewrite a cell twice when a
# gene name equalled another ID. If an ID occurs more than once in GeneName,
# its first entry is used.
cgeneID <- function(GeneName, DATA) {
  if (nrow(DATA) == 0 || nrow(GeneName) < 2) {
    return(DATA)
  }
  current_ids <- as.character(unlist(DATA[1, ], use.names = FALSE))
  known_ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))

  hit <- match(current_ids, known_ids)
  # An NA cell in DATA is never a real ID, even if GeneName has an NA entry.
  found <- !is.na(hit) & !is.na(current_ids)
  if (any(found)) {
    DATA[1, found] <- GeneName[2, hit[found]]
  }
  DATA
}

#5#Function for adjusting the gene names
# Picks one gene name out of column names that list several names
# separated by "///".
#   DiData: matrix or data frame whose column names are to be shortened.
#   usecol: position of the name to keep (default 1). Names with fewer than
#           `usecol` parts fall back to their first part.
# Returns a character vector with one trimmed name per column of DiData.
gcnames <- function(DiData, usecol = 1) {
  full_names <- colnames(DiData)
  if (length(full_names) == 0) {
    return(character(0))
  }
  pieces <- strsplit(full_names, "///")
  vapply(
    pieces,
    function(parts) {
      chosen <- if (length(parts) >= usecol) parts[usecol] else parts[1]
      trimws(chosen)
    },
    character(1)
  )
}

#6# Function for discretizing the data
# Turns numeric values into three levels.
#   NDATA: numeric matrix (or data frame of numeric columns).
# Returns a numeric matrix of the same dimensions and column names with
#   0 for values below -1,
#   1 for values from -1 to 1 (both ends included),
#   2 for values above 1.
# Missing values (NA/NaN) are passed through unchanged.
# A value of exactly 1 used to match no rule and kept the initial 0; it is
# now placed in the middle level, mirroring how -1 is treated.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  DDATA <- matrix(NA_real_, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are skipped here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values >= -1 & values <= 1)] <- 1
  DDATA[which(values > 1)] <- 2

  missing <- which(is.na(values))
  DDATA[missing] <- as.numeric(values[missing])
  DDATA
}

#The Rest of this code will be used every time you want to change a data set
# Interactive driver: asks for a GEO series matrix file and its GPL platform
# file, then writes two tab-separated files into the working directory:
#   GSE<digits>aftexcel.txt  clinical fields plus raw expression values
#   GSE<digits>dscrt.txt     clinical fields plus discretized expression values
# It also maintains two helper files in the working directory:
#   Clean_GPL<digits>.txt    two-column ID/name table for the platform
#   GPL_ID_LOC.txt           header offset found for each soft/annot GPL file

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# File names without their directories (split on "\", "|" and "/").
gplfile <- strsplit(genena, "[\\|/]")[[1]]
gplfile <- gplfile[length(gplfile)]
gsefile <- strsplit(alz, "[\\|/]")[[1]]
gsefile <- gsefile[length(gsefile)]
# Digits of the series file name, used to build both output file names.
gsenum <- gsub("\\D", "", gsefile)

#Find out if it is a soft GPL file or not
soft <- grepl("soft|annot", gplfile)

#Working with the wordy part of the document
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Row 1 (the field labels) is only needed for renaming and is dropped here.
ALZWORD <- chngrownm(ALZWORD)[-1, ]
# Columns still carrying the default "col<n>" name were not recognised.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

##Is there a clean version of the GPL file available?
gplnum <- gsub("\\D", "", gplfile)
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
# Columns of a GPL table that may hold the gene name.
gpl_name_cols <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$"

if (file.exists(clean_gpl_file)) {
  #use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version

  ##Gene ID to Gene Name
  if (soft) {
    loc_file <- "GPL_ID_LOC.txt"
    id_skip <- NULL

    #Check to see if this GPL soft file has been used before
    if (file.exists(loc_file)) {
      gpl_index <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      # Exact numeric comparison: GPL570 must not be taken for GPL5700.
      known <- which(gpl_index$GPL_FILE_NUM == as.integer(gplnum))
      if (length(known) > 0) {
        id_skip <- gpl_index$LOC_ID[known[1]]
      }
    }

    if (is.null(id_skip)) {
      #No information on this particular GPL file
      # Position of the "ID" header cell within the first 1000 rows, minus 1.
      # NOTE(review): this relies on how readr lays out the ragged header
      # lines of a soft file - confirm against a real GPL file.
      id_skip <- genena %>%
        read_delim(delim = "\t", col_names = FALSE, comment = "!",
                   n_max = 1000) %>%
        t(.) %>%
        grep("^ID\\s*$", .)
      id_skip <- id_skip - 1

      new_entry <- cbind(as.integer(gplnum), as.integer(id_skip))
      if (file.exists(loc_file)) {
        cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        #We must create a file that we can access for later use
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!",
                 skip = id_skip) %>%
      dplyr::select(., ID, grep(gpl_name_cols, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(gpl_name_cols, colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

##Changing the gene ID to gene name
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

#Full RAW Data
Fullalzdwr <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

#Raw file is output
nfnaex <- paste0("GSE", gsenum, "aftexcel.txt")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

#Now for the discretization part
##get the wordy part again
rawword <- t(ALZWORDF)

##where is ID_REF located
# The field names are the row names of the transposed table; searching the
# cell values (as before) finds nothing and then drops every row below.
hereim <- grep("^ID_REF$", rownames(rawword))
if (length(hereim) != 1) {
  stop("Expected exactly one ID_REF field in the sample information, found ",
       length(hereim), call. = FALSE)
}

##Subject Names GSM...
subjnam <- rawword[hereim, ]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame()
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame() %>%
  bind_cols(namedarows, .)

# Number of missing values in each clinical row.
naroww <- as.data.frame(rep(0, dim(RAWWORD)[1]), stringsAsFactors = FALSE)
for (z in seq_len(dim(RAWWORD)[1])) {
  naroww[z, 1] <- as.integer(sum(is.na(RAWWORD[z, ])))
}
colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD, naroww)

roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(.)
colnames(roALZna) <- "ID_REF"

RAWDAT <- t(ALZDAT) %>%
  as.data.frame(.)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

##Editing the file for R processing
RAWDATID <- RAWDAT2[, 1] %>%
  as.matrix(.)

RAWDATNUM <- RAWDAT2[, -1] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

##Consolidating genes with the same name
###create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  same_gene <- which(RAWDATID == rownames(tabRDATID)[j])

  ##Putting the ones without duplicates in their new homes
  if (tabRDATID[j] == 1) {
    NuRDATN[, j] <- RAWDATNUM[, same_gene]
  }
  ##Averaging duplicates and putting them in their new homes
  if (tabRDATID[j] > 1) {
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_gene], na.rm = TRUE)
  }
}

##Scaling the Data
scrawdat <- NuRDATN %>%
  scale()
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

##Discretized the Data
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t() %>%
  as.data.frame(.)
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

##NAs in a column
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
for (x in 2:dim(dialzdat)[2]) {
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[, x])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

##NAs in a row
narowd <- as.data.frame(rep(0, dim(dialzdat)[1]), stringsAsFactors = FALSE)
for (y in seq_len(dim(dialzdat)[1])) {
  narowd[y, 1] <- as.integer(sum(is.na(dialzdat[y, ])))
}
colnames(narowd) <- "ROW_NAs"
dialzdat <- bind_cols(dialzdat, narowd)
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

##converting to character so that the clinical can be brought together with discrete data
# Every column except the last (ROW_NAs) is converted. The earlier
# `2:dim(dialzdat)[2]-1` parsed as `(2:n) - 1`, i.e. the same columns.
for (k in seq_len(dim(dialzdat)[2] - 1)) {
  dialzdat[, k] <- as.character(dialzdat[, k])
}

#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

#Produces Discrete file
nfnaex <- paste0("GSE", gsenum, "dscrt.txt")
write.table(Dscrtalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Outputs raw and discretized data files (UNTESTED)

+##Posted 6/15/2017
 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1#Function for handling the changing of row names and column names
 # Rename the columns of a transposed GEO sample table.
 #
 # mat: matrix (or data frame) with one column per "!Sample_*" field. Row 1
 #      must hold the field labels and row 2 the first sample's values, so
 #      at least two rows are needed whenever there are columns.
 # Returns mat with its column names updated:
 #   "Brain_Region", "Title", "ID_REF" are chosen from the label in row 1;
 #   "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>", "Group<n>" are chosen from
 #   keywords found in row 2, numbered separately per kind.
 # Columns that match nothing keep their current name. When several keyword
 # groups match one column, the last matching group names it, but every
 # matching group still advances its own counter.
 chngrownm <- function(mat){
 	col <- dim(mat)[2]
 	# Element-wise `colnames(mat)[j] <-` fails on a matrix that has no
 	# column names, so start from R's default labels ("col1", "col2", ...).
 	if(col > 0 && is.null(colnames(mat))){
 		colnames(mat) <- colnames(mat,do.NULL = FALSE)
 	}
 	x <- 1
 	p <- 1
 	a <- 1
 	b <- 1
 	g <- 1
 	# seq_len() gives an empty loop for a zero-column table (1:0 would not).
 	for(j in seq_len(col)){
 		# isTRUE() treats a missing label as "no match" instead of an error.
 		if(isTRUE(mat[1,j] == "!Sample_source_name_ch1")){
 			colnames(mat)[j] <- "Brain_Region"
 		}
 		if(isTRUE(mat[1,j] == "!Sample_title")){
 			colnames(mat)[j] <- "Title"
 		}
 		if(isTRUE(mat[1,j] == "!Sample_geo_accession")){
 			colnames(mat)[j] <- "ID_REF"
 		} else{
 			# Only the ID_REF column skips the keyword checks below.
 			if(grepl("Sex|gender|Gender|sex",mat[2,j])){
 				colnames(mat)[j] <- paste0("Sex",x)
 				x <- x + 1
 			}
 			if(grepl("postmorteminterval|PMI|pmi",mat[2,j])){
 				colnames(mat)[j] <- paste0("PMI",p)
 				p <- p + 1
 			}
 			if(grepl("age|Age|AGE",mat[2,j])){
 				colnames(mat)[j] <- paste0("Age",a)
 				a <- a + 1
 			}
 			if(grepl("braak|b&b",mat[2,j])){
 				colnames(mat)[j] <- paste0("Braak",b)
 				b <- b + 1
 			}
 			if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])){
 				colnames(mat)[j] <- paste0("Group",g)
 				g <- g + 1
 			}
 		}
 	}
 	mat
 }
 #2#Function for reorganizing information within the columns
 # Clean the values of the clinical columns named by chngrownm().
 #
 # mat: table whose column names mark the kind of content. The first column
 #      is never touched.
 # Returns mat where, for every column from the second onwards:
 #   "Group*" keeps the text after the "<label>: " prefix (and drops a
 #            trailing part that contains ";");
 #   "Age*"   keeps only the digits and becomes integer;
 #   "Sex*"   keeps the text after the "<label>: " prefix;
 #   "PMI*"   keeps digits and dots and becomes numeric;
 #   "Braak*" drops the prefix, reads the rest as a roman numeral and
 #            becomes integer.
 cinfo <- function(mat){
 	col <- dim(mat)[2]
 	# seq_len(col)[-1] is empty for a one-column table; 2:col would count
 	# backwards (2, 1) and rewrite the first column.
 	for(j in seq_len(col)[-1]){
 		if(grepl("Group",colnames(mat)[j])){
 			mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
 		}
 		if(grepl("Age",colnames(mat)[j])){
 			mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
 		}
 		if(grepl("Sex",colnames(mat)[j])){
 			mat[,j] <- gsub(".+:\\s","",mat[,j])
 		}
 		if(grepl("PMI",colnames(mat)[j])){
 			mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
 		}
 		if(grepl("Braak",colnames(mat)[j])){
 			mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
 		}
 	}
 	mat
 }
 #3#Function for labeling the gene IDs without names
 # Give every gene ID without a name its own ID as the name.
 #
 # GIDNAM: data frame, tibble or matrix with the gene IDs in column 1 and
 #         the gene names in column 2.
 # Returns GIDNAM where each name that is missing, or is the text "NA"
 # (optionally followed by whitespace), is replaced by the ID of that row.
 # A factor name column is returned as character so that IDs which are not
 # existing levels can be stored.
 NAFIXING <- function(GIDNAM){
 	# An empty table has nothing to relabel (1:0 would still loop twice).
 	if(dim(GIDNAM)[1] == 0){
 		return(GIDNAM)
 	}
 	isdf <- is.data.frame(GIDNAM)
 	ids <- if(isdf) GIDNAM[[1]] else GIDNAM[,1]
 	gnames <- if(isdf) GIDNAM[[2]] else GIDNAM[,2]
 	# Assigning a factor into a character vector would store its integer
 	# codes, so work with the labels.
 	if(is.factor(ids)){
 		ids <- as.character(ids)
 	}
 	if(is.factor(gnames)){
 		gnames <- as.character(gnames)
 	}
 	unnamed <- is.na(gnames) | grepl("^NA\\s*$",gnames)
 	gnames[unnamed] <- ids[unnamed]
 	if(isdf){
 		GIDNAM[[2]] <- gnames
 	} else{
 		GIDNAM[,2] <- gnames
 	}
 	GIDNAM
 }
 #4#Function for changing the gene ID to gene name
 # Replace the gene IDs in the first row of DATA by their gene names.
 #
 # GeneName: two-row table; row 1 holds the gene IDs, row 2 the matching
 #           gene names.
 # DATA:     table whose first row holds gene IDs.
 # Returns DATA where every first-row entry that is exactly equal to an ID
 # in GeneName is replaced by that ID's name. Entries without a match, and
 # all other rows, are left unchanged.
 cgeneID <- function(GeneName,DATA){
 	if(dim(GeneName)[2] == 0){
 		return(DATA)
 	}
 	# IDs are compared as plain text: characters such as "." or "+" in an
 	# ID must not act as regular-expression operators. match() also
 	# replaces one pattern search per ID with a single lookup.
 	hits <- match(as.character(DATA[1,]),as.character(GeneName[1,]),incomparables = NA)
 	found <- which(!is.na(hits))
 	if(length(found) > 0){
 		# With a repeated ID the first entry in GeneName is used.
 		DATA[1,found] <- GeneName[2,hits[found]]
 	}
 	DATA
 }
 #5#Function for adjusting the gene names
 # Pick one gene name per column from names of the form "A /// B /// C".
 #
 # DiData: table whose column names are gene names, possibly several
 #         alternatives separated by "///".
 # usecol: position of the alternative to keep (default 1). Columns with
 #         fewer alternatives fall back to their first one.
 # Returns a character vector with one whitespace-trimmed name per column
 # of DiData (character(0) for a table without columns).
 gcnames <- function(DiData,usecol=1){
 	nuruns <- dim(DiData)[2]
 	# 1:nuruns would run for indices 1 and 0 on an empty table.
 	if(nuruns == 0){
 		return(character(0))
 	}
 	# Split every column name once instead of up to twice per column.
 	alternatives <- strsplit(colnames(DiData),"///")
 	nwnam <- vapply(alternatives,function(alts){
 		if(length(alts) >= usecol){
 			alts[usecol]
 		} else{
 			alts[1]
 		}
 	},character(1),USE.NAMES = FALSE)
 	str_trim(nwnam)
 }
 #6# Function for discretizing the data
 # Discretize a numeric table into three levels.
 #
 # NDATA: numeric matrix or data frame (the script passes scaled data).
 # Returns a numeric matrix with the dimensions and column names of NDATA:
 #   0 where the value is below -1,
 #   1 where the value lies in [-1, 1],
 #   2 where the value is above 1,
 #   and missing values are copied through unchanged.
 dndat <- function(NDATA){
 	vals <- as.matrix(NDATA)
 	# Start from the middle level so both boundaries (-1 and 1) end up as
 	# 1; a value of exactly 1 used to match no rule and stayed 0.
 	DDATA <- matrix(1,nrow=dim(NDATA)[1],ncol=dim(NDATA)[2])
 	colnames(DDATA) <- colnames(NDATA)
 	gone <- is.na(vals)
 	DDATA[!gone & vals < -1] <- 0
 	DDATA[!gone & vals > 1] <- 2
 	DDATA[gone] <- vals[gone]
 	DDATA
 }
 #Everything below runs once for each data set that is converted
 #Ask for the series matrix file
 print("Choose the series matrix file that you want to Analyze")
 alz <- file.choose()
 #Ask for the GPL file that belongs to it
 print("Choose the GPL file that correlates with the above series matrix file")
 genena <- file.choose()
 #A GPL file is treated as soft when its file name contains "soft" or "annot"
 soft <- grepl("soft|annot",tail(strsplit(genena,"[\\|/]")[[1]],1))
 #Sample information: the "!Sample..." lines of the series matrix file
 #("!Series..." lines are read as comments, "!Sample_contact..." lines are dropped)
 alzword <- read_delim(alz,delim ="\t",comment = "!Series",col_names = FALSE) %>%
 	filter(grepl("!Sample",X1),!grepl("!Sample_contact",X1))
 ##One row per sample, with default column names for chngrownm() to replace
 ALZWORD <- t(alzword)
 rownames(ALZWORD) <- NULL
 colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
 ##Rename the columns, then drop the label row and every column left unnamed
 ALZWORD <- chngrownm(ALZWORD)[-1,]
 ALZWORD <- dplyr::select(as.data.frame(ALZWORD),-starts_with("col"))
 ##Clean up the values inside the renamed columns
 ALZWORDF <- cinfo(ALZWORD)
 #Expression values: the table part of the same file
 alzdat <- read_delim(alz,delim="\t",col_names=TRUE,comment = "!",skip=1)
 ##Drop the first column and turn the rest so that samples become rows
 ALZDAT <- t(alzdat[,-1])
 rownames(ALZDAT) <- NULL
 ##Is there a clean version of the GPL file available?
 ##The GPL number is every digit found in the name of the chosen GPL file
 gplnum <- strsplit(genena,"[\\|/]") %>%
 	.[[1]] %>%
 	.[length(.)] %>%
 	gsub("\\D","",.)
 ##Look for exactly the file written at the end of this section; a substring
 ##search would also accept e.g. Clean_GPL570.txt when looking for GPL57
 cleangpl <- paste0("Clean_GPL",gplnum,".txt")
 clfileex <- as.integer(file.exists(cleangpl))
 if(clfileex >= 1){
 	#use the clean version
 	geneIDNam <- read_delim(cleangpl,delim="\t",col_names = c("ID","Symbol"), comment = "!")
 } else{
 	##Lets Create a clean version
 	##Gene ID to Gene Name
 	if(soft == TRUE){
 		#GPL_ID_LOC.txt remembers, per GPL number, how many lines to skip
 		#to reach the line that starts the ID table of a soft file
 		locfile <- "GPL_ID_LOC.txt"
 		idLOCGPL <- integer(0)
 		if(file.exists(locfile)){
 			#Has this GPL soft file been used before? (exact number match)
 			gplloc <- read_delim(locfile,delim = "\t",col_names = TRUE)
 			idLOCGPL <- gplloc$LOC_ID[which(gplloc$GPL_FILE_NUM == as.integer(gplnum))]
 		}
 		if(length(idLOCGPL) == 0){
 			#No information on this particular GPL file yet: find the ID line
 			idLOCGPL <- genena %>%
 				read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%
 				t(.) %>%
 				grep("^ID\\s*$",.) %>%
 				-1
 			if(length(idLOCGPL) == 0){
 				stop("Could not find the ID line in the first 1000 lines of the GPL file")
 			}
 			idLOCGPL <- idLOCGPL[1]
 			#Store it for later use; the header is only written for a new file
 			newloc <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
 			colnames(newloc) <- c("GPL_FILE_NUM","LOC_ID")
 			haveloc <- file.exists(locfile)
 			write.table(newloc,file = locfile, sep = "\t",row.names = FALSE, col.names = !haveloc, append = haveloc)
 		}
 		geneIDNam <- genena %>%
 			read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL[1]) %>%
 			dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 	}
 	if(soft == FALSE){
 		geneIDNam <- genena %>%
 			read_delim(delim="\t",comment = "#")%>%
 			dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
 	}
 	##Labeling the gene IDs without names
 	geneIDNam <- NAFIXING(geneIDNam)
 	##remove the whitespace
 	geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))
 	##Here is the clean version
 	write.table(geneIDNam,file = cleangpl,sep = "\t",row.names = FALSE, col.names = FALSE)
 }
 ##Changing the gene ID to gene name
 ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
 colnames(ALZDAT) <- ALZDAT1[1,]
 ##Adjusting the column names aka the gene names
 colnames(ALZDAT) <- gcnames(ALZDAT)
 #Full RAW Data: sample information followed by the expression values
 Fullalzdwr <- ALZDAT %>%
 	as.data.frame() %>%
 	cbind(ALZWORDF,.)
 #Raw file is output
 ##The GSE number is every digit of the file name. basename() removes the
 ##directories for "/" as well as "\" paths, so digits in folder names can
 ##no longer end up in the output name.
 nfnaex <- paste0("GSE",gsub("\\D","",basename(alz)),"aftexcel.txt")
 write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")
 #Now for the discretization part
 ##get the wordy part again
 rawword <- t(ALZWORDF)
 ##where is ID_REF located
 ##t() moves the column names of ALZWORDF into the row names, so the ID_REF
 ##row has to be found by name rather than by searching the cell values
 hereim <- grep("^ID_REF$",rownames(rawword))
 if(length(hereim) != 1){
 	##with no match, [-hereim] below would silently drop every row
 	stop("Expected exactly one ID_REF row in the sample information")
 }
 ##Subject Names GSM...
 subjnam <- rawword[hereim,]
 ##Getting the names for the rows
 namedarows <- rownames(rawword)[-hereim] %>%
 	as.data.frame()
 ##drop = FALSE keeps the row layout when only one row is left
 RAWWORD <- rawword[-hereim,,drop = FALSE] %>%
 	as.data.frame() %>%
 	bind_cols(namedarows,.)
 ##Number of NAs in each row of the sample information
 naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
 for(z in seq_len(dim(RAWWORD)[1])){
 	naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
 }
 colnames(naroww) <- "ROW_NAs"
 RAWWORD <- bind_cols(RAWWORD,naroww)
 ##Gene names (the column names of ALZDAT) as an ID_REF column
 roALZna <- t(ALZDAT) %>%
 	rownames(.) %>%
 	as.data.frame(.)
 colnames(roALZna) <- "ID_REF"
 RAWDAT <- t(ALZDAT) %>%
 	as.data.frame(.)
 colnames(RAWDAT) <- NULL
 rownames(RAWDAT) <- NULL
 RAWDAT2 <- RAWDAT %>%
 	cbind(roALZna,.) %>%
 	dplyr::arrange(.,ID_REF)
 ##Editing the file for R processing
 RAWDATID <- RAWDAT2[,1] %>%
 	as.matrix(.)
 RAWDATNUM <- RAWDAT2[,-1] %>%
 	mapply(.,FUN = as.numeric) %>%
 	t(.)
 ##Consolidating genes with the same name
 ###create empty matrix with one column per distinct gene name in tabRDATID
 tabRDATID <- table(RAWDATID)
 NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
 for(j in seq_along(tabRDATID)){
 	##columns of RAWDATNUM that carry the j-th gene name
 	samegene <- which(RAWDATID==rownames(tabRDATID)[j])
 	##Putting the ones without duplicates in their new homes
 	if(tabRDATID[j] == 1){
 		NuRDATN[,j] <- RAWDATNUM[,samegene]
 	}
 	##Averaging duplicates and putting them in their new homes
 	if(tabRDATID[j] > 1){
 		##drop = FALSE keeps a matrix when RAWDATNUM has a single row;
 		##rowMeans() cannot work on the plain vector left otherwise
 		NuRDATN[,j] <- rowMeans(RAWDATNUM[,samegene,drop = FALSE],na.rm = TRUE)
 	}
 }
 ##Scale every gene column and drop the attributes that scale() attaches
 scrawdat <- scale(NuRDATN)
 attr(scrawdat,"scaled:center") <- NULL
 attr(scrawdat,"scaled:scale") <- NULL
 colnames(scrawdat) <- rownames(tabRDATID)
 ##Discretize, then turn the table so that genes become rows
 dialzdat <- as.data.frame(t(dndat(scrawdat)))
 colnames(dialzdat) <- rownames(RAWDATNUM)
 ##Move the gene names out of the row names into a leading "ID_REF" column
 geneNAM <- as.data.frame(as.matrix(rownames(dialzdat)))
 colnames(geneNAM) <- "ID_REF"
 rownames(dialzdat) <- NULL
 dialzdat <- bind_cols(geneNAM,dialzdat)
 ##NAs in a column
 ##One extra row: the label "COL_NAs" followed by the NA count of every data column
 nacol <- data.frame("COL_NAs",t(as.numeric(colSums(is.na(dialzdat[,-1,drop = FALSE])))),stringsAsFactors = FALSE)
 colnames(nacol) <- colnames(dialzdat)
 dialzdat <- bind_rows(dialzdat,nacol)
 ##NAs in a row
 ##One extra column with the NA count of every row, the COL_NAs row included
 narowd <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat))))
 dialzdat <- bind_cols(dialzdat,narowd)
 ##The columns between ID_REF and ROW_NAs are named after the subjects
 colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
 colnames(RAWWORD) <- colnames(dialzdat)
 ##converting to character so that the clinical can be brought together with discrete data
 ##Every column except the trailing ROW_NAs count is converted. The range
 ##was written 2:dim(dialzdat)[2]-1, which R reads as (2:n)-1, i.e. columns
 ##1 to n-1; the same columns are kept here, spelled out explicitly.
 for(k in seq_len(dim(dialzdat)[2]-1)){
 	dialzdat[,k] <- as.character(dialzdat[,k])
 }
 #The End the full data: clinical rows on top of the discretized rows
 Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)
 #Produces Discrete file
 ##Named after the digits in the series matrix file name (alz); the
 ##object "rawdat" used here before is not defined anywhere in this script
 nfnaex <- paste0("GSE",gsub("\\D","",basename(alz)),"dscrt.txt")
 write.table(Dscrtalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)