# Efrain Gonzalez / Cleaning and Fixing Data with R

1

##Posted 6/15/2017

1

##Posted 6/15/2017

2

options(digits = 11)

2

options(digits = 11)

3

4

#Libraries required to run the code

4

#Libraries required to run the code

5

library(pryr)

5

library(pryr)

6

library(MASS)

6

library(MASS)

7

library(dplyr)

7

library(dplyr)

8

library(tidyr)

8

library(tidyr)

9

library(readr)

9

library(readr)

10

library(stringr)

10

library(stringr)

11

12

13

#Necessary Functions

13

#Necessary Functions

14

#1#Function for handling the changing of row names and column names

14

#1#Function for handling the changing of row names and column names

15

# Rename the columns of a transposed sample-metadata matrix.
#
# `mat` holds the GEO field tags in row 1 and the first sample's values in
# row 2. Columns whose tag is a known "!Sample_*" field get a fixed name;
# every other column is named from keywords found in its first sample value
# (Sex, PMI, Age, Braak, Group) with a running counter per category.
# When several keyword groups match the same column, the last matching group
# names the column, but every matching group's counter still advances.
# Columns that match nothing keep their existing name.
#
# Returns `mat` with updated column names (contents untouched).
chngrownm <- function(mat){
  n_row <- dim(mat)[1]
  n_col <- dim(mat)[2]
  # Nothing to inspect without a tag row.
  if (n_row < 1) {
    return(mat)
  }

  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Order matters: later categories overwrite earlier ones on the same column.
  keyword_patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  for (j in seq_len(n_col)) {
    tag <- as.character(mat[1, j])
    if (!is.na(tag) && tag %in% names(fixed_names)) {
      colnames(mat)[j] <- fixed_names[[tag]]
      next
    }
    # A missing second row simply means no keyword can match.
    detail <- if (n_row >= 2) mat[2, j] else NA_character_
    for (label in names(keyword_patterns)) {
      if (isTRUE(grepl(keyword_patterns[[label]], detail))) {
        colnames(mat)[j] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1
      }
    }
  }
  mat
}

58

59

#2#Function for reorganizing information within the columns

59

#2#Function for reorganizing information within the columns

60

# Clean the values inside the renamed metadata columns.
#
# The first column is left untouched; every later column is processed
# according to the category found in its name:
#   Group - strip the "label: " prefix and any trailing "; ..." part
#   Age   - keep the first run of digits as an integer (NA if none)
#   Sex   - strip the "label: " prefix
#   PMI   - keep digits and dots, convert to numeric
#   Braak - strip the "label: " prefix, read the rest as a roman numeral
#
# Returns `mat` with the cleaned columns.
cinfo <- function(mat){
  field_names <- colnames(mat)
  # seq_len(n)[-1] is empty for a one-column input, where 2:n would run backwards.
  for (j in seq_len(dim(mat)[2])[-1]) {
    field <- field_names[j]

    if (isTRUE(grepl("Group", field))) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }

    if (isTRUE(grepl("Age", field))) {
      # Take only the first digit run: removing every non-digit would turn
      # "85.5" into 855 and "85 (80-90)" into 858090.
      raw_age <- as.character(mat[, j])
      first_run <- regexpr("[0-9]+", raw_age)
      has_digits <- !is.na(first_run) & first_run > 0
      age <- rep(NA_integer_, length(raw_age))
      age[has_digits] <- as.integer(regmatches(raw_age, first_run))
      mat[, j] <- age
    }

    if (isTRUE(grepl("Sex", field))) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }

    if (isTRUE(grepl("PMI", field))) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }

    if (isTRUE(grepl("Braak", field))) {
      stage <- gsub(".+:\\s", "", mat[, j])
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }
  mat
}

87

88

#3#Function for labeling the gene IDs without names

88

#3#Function for labeling the gene IDs without names

89

# Label the gene IDs that have no gene name.
#
# `GIDNAM` is a two-column table (ID in column 1, gene name in column 2).
# Wherever the name is a real NA or the text "NA" (optionally followed by
# whitespace), the ID is copied into the name column.
#
# Returns `GIDNAM` with the gaps filled.
NAFIXING <- function(GIDNAM){
  # An empty table has nothing to fix (1:0 would index rows that do not exist).
  if (dim(GIDNAM)[1] == 0) {
    return(GIDNAM)
  }
  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  if (any(unnamed)) {
    GIDNAM[unnamed, 2] <- ids[unnamed]
  }
  GIDNAM
}

100

101

#4#Function for changing the gene ID to gene name

101

#4#Function for changing the gene ID to gene name

102

# Change the gene IDs of the expression table to gene names.
#
# GeneName: two-column table from the GPL file (ID, gene name).
# DATA:     expression table whose first column holds the probe IDs.
#
# Returns t(DATA) with the IDs in its first row replaced by the matching gene
# name; IDs that are not present in `GeneName` are left as they are.
cgeneID <- function(GeneName,DATA){
  nj <- t(GeneName)
  nq <- t(DATA)
  if (dim(nq)[2] == 0 || dim(nj)[2] == 0) {
    return(nq)
  }
  # Exact lookup instead of building a regex from each ID: IDs containing
  # regex metacharacters ("." "+" "(" ...) would otherwise match the wrong
  # entry or raise an error. Surrounding padding introduced by t() on mixed
  # tables is ignored for the comparison only. NA never matches anything.
  probe_ids <- trimws(nq[1, ])
  known_ids <- trimws(nj[1, ])
  where <- match(probe_ids, known_ids, incomparables = NA)
  found <- !is.na(where)
  nq[1, found] <- nj[2, where[found]]
  nq
}

120

#cgeneID <- function(GeneName,DATA){

120

#cgeneID <- function(GeneName,DATA){

121

# colGene <- dim(GeneName)[2]

121

# colGene <- dim(GeneName)[2]

122

# j <- 1

122

# j <- 1

123

# for(j in 1:colGene){

123

# for(j in 1:colGene){

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

125

# if(is.na(sum(chngsreq))==FALSE){

125

# if(is.na(sum(chngsreq))==FALSE){

126

# if(sum(chngsreq) > 0){

126

# if(sum(chngsreq) > 0){

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

128

# }

128

# }

129

# }

129

# }

130

# #if(sum(chngsreq) > 0){

130

# #if(sum(chngsreq) > 0){

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

133

# #}

133

# #}

134

# j = j+1

134

# j = j+1

135

# }

135

# }

136

# DATA

136

# DATA

137

#}

137

#}

138

139

#5#Function for adjusting the gene names

139

#5#Function for adjusting the gene names

140

# Adjust the gene names stored in the column names of `DiData`.
#
# A column name may list several alternatives separated by "///" or "//".
# The `usecol`-th alternative is kept when it exists, otherwise the first
# one; surrounding whitespace is removed.
#
# Returns a character vector with one cleaned name per column.
gcnames <- function(DiData,usecol=1){
  if (dim(DiData)[2] == 0) {
    return(character(0))
  }
  labels <- colnames(DiData)
  if (is.null(labels)) {
    stop("DiData must have column names", call. = FALSE)
  }
  alternatives <- strsplit(labels, "///|//")
  vapply(
    alternatives,
    function(alts) {
      # An empty name splits into nothing; alts[1] is then NA.
      chosen <- if (length(alts) >= usecol) alts[usecol] else alts[1]
      # Base-R trim so the helper does not need stringr.
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

155

156

#6# Function for discretizing the data

156

#6# Function for discretizing the data

157

# Discretize a matrix of scaled values.
#
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
#   missing           -> stays missing
#
# Returns a numeric matrix of the same dimensions carrying the column names
# of `NDATA`.
dndat <- function(NDATA){
  values <- as.matrix(NDATA)
  # Start from the middle class and overwrite the two tails; which() drops
  # the NA comparisons so missing cells are handled separately below.
  DDATA <- matrix(1, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
  colnames(DDATA) <- colnames(NDATA)
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2
  gaps <- which(is.na(values))
  DDATA[gaps] <- values[gaps]
  DDATA
}

184

185

186

#The Rest of this code will be used every time you want to change a data set

186

#The Rest of this code will be used every time you want to change a data set

187

188

# Getting the series matrix file (the user is asked exactly once per file)
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# Find out if it is a soft GPL file or not: only the file name itself (the
# last path component) is inspected, never the directories above it.
gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
soft <- grepl("soft|annot", gpl_path_parts[length(gpl_path_parts)])

202

203

# Working with the wordy part of the document: keep the "!Sample" lines of
# the series matrix file, except the submitter contact details.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

## Changing row names and column names:
## after transposing, row 1 holds the field tags and each later row a sample.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a matrix even when only one sample or one field remains.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying the default "col<n>" name were not recognised.
ALZWORD <- ALZWORD %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  dplyr::select(-starts_with("col"))

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

220

221

222

# Working with Actual Data part of file: every "!" line is treated as a
# comment, leaving the expression table with the probe IDs in column 1.
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# One row per sample, one column per probe; the ID column is re-attached
# later as column names.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

227

228

## Is there a clean version of the GPL file available?
## The GPL number is every digit found in the GPL file name.
gpl_file_parts <- strsplit(genena, "[\\|/]")[[1]]
gplnum <- gsub("\\D", "", gpl_file_parts[length(gpl_file_parts)])
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

# Test for the exact file that will be read; a substring search of the
# directory listing would also accept e.g. "Clean_GPL9600.txt" for GPL96.
if (file.exists(clean_gpl_file)) {
  # use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
} else {
  ## Lets create a clean version

  # Columns of the GPL table that may hold the gene symbol.
  symbol_columns <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$"

  ## Gene ID to Gene Name
  if (soft) {
    # GPL_ID_LOC.txt remembers, per GPL number, how many lines precede the
    # table header inside the soft file.
    id_loc_file <- "GPL_ID_LOC.txt"
    id_loc_exists <- file.exists(id_loc_file)
    skip_lines <- integer(0)

    if (id_loc_exists) {
      # Check to see if this GPL soft file has been used before. The number is
      # compared exactly; a substring search would confuse GPL96 with GPL1396.
      id_loc <- read_delim(id_loc_file, delim = "\t", col_names = TRUE)
      known <- which(id_loc$GPL_FILE_NUM == as.integer(gplnum))
      skip_lines <- id_loc$LOC_ID[known]
    }

    if (length(skip_lines) > 0) {
      skip_lines <- skip_lines[1]
    } else {
      # No information on this particular GPL file: locate the "ID" header
      # cell within the first 1000 lines and remember its position.
      id_position <- genena %>%
        read_delim(delim = "\t", col_names = FALSE, comment = "!", n_max = 1000) %>%
        t(.) %>%
        grep("^ID\\s*$", .)
      if (length(id_position) == 0) {
        stop("Could not locate the ID header row in the GPL file", call. = FALSE)
      }
      skip_lines <- id_position - 1

      new_entry <- cbind(as.integer(gplnum), as.integer(skip_lines))
      if (id_loc_exists) {
        cat(new_entry, file = id_loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        # We must create a file that we can access for later use
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = id_loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = skip_lines) %>%
      dplyr::select(., ID, grep(symbol_columns, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(symbol_columns, colnames(.)))
  }

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace (first two columns only: ID and gene name)
  gene_rows <- t(geneIDNam)
  geneIDNam <- t(rbind(str_trim(gene_rows[1, ]), str_trim(gene_rows[2, ])))

  ## Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

304

305

306

307

## Changing the gene ID to gene name: only the first row of the result
## (the translated IDs) is needed as column names of the expression matrix.
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

# Full RAW Data: sample metadata columns followed by one column per gene.
Fullalzdwr <- ALZDAT %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  cbind(ALZWORDF, .)

320

321

322

# Raw file is output as GSE<digits of the series file name>aftexcel.txt.
# The path is split on both "\" and "/" so that digits in directory names
# never leak into the output name, whatever the platform's separator.
series_path_parts <- strsplit(alz, "[\\|/]")[[1]]
series_file_name <- series_path_parts[length(series_path_parts)]
nfnaex <- paste0("GSE", gsub("\\D", "", series_file_name), "aftexcel.txt")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

330

331

332

# Now for the discretization part
## get the wordy part again (one row per metadata field, one column per sample)
rawword <- t(ALZWORDF)

## where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))

## Subject Names GSM...
subjnam <- rawword[hereim, ]

## Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(., stringsAsFactors = FALSE)
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  bind_cols(namedarows, .)

## Missing values per row: real NAs plus cells whose whole text is "NA".
## The text test is anchored so values such as "RNA" are not counted.
real_na <- rowSums(is.na(RAWWORD))
text_na <- rowSums(matrix(grepl("^\\s*NA\\s*$", as.matrix(RAWWORD)),
                          nrow = nrow(RAWWORD)))
naroww <- as.data.frame(unname(real_na + text_na), stringsAsFactors = FALSE)

colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD, naroww)

362

363

364

# Gene names of the expression matrix as a one-column table.
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

# Expression values with one row per gene, stripped of all names.
RAWDAT <- t(ALZDAT) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

# Sorting by gene name puts duplicated genes next to each other.
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

## Editing the file for R processing
RAWDATID <- RAWDAT2[, 1] %>%
  as.matrix(.)

# Numeric matrix with one row per sample and one column per gene.
RAWDATNUM <- RAWDAT2[, -1] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

385

386

## Consolidating genes with the same name
### create empty matrix with one column per distinct gene name
tabRDATID <- table(RAWDATID)
gene_labels <- rownames(tabRDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  same_gene <- which(RAWDATID == gene_labels[j])
  if (tabRDATID[[j]] == 1) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, same_gene]
  } else if (tabRDATID[[j]] > 1) {
    ## Averaging duplicates and putting them in their new homes;
    ## drop = FALSE keeps a matrix even when there is a single sample row.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_gene, drop = FALSE], na.rm = TRUE)
  }
}

402

403

## Scaling the Data: every gene column is centred and scaled to z-scores.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

# Outputting the Z-score file as GSE<digits of the series file name>zscore.txt.
# The path is split on both "\" and "/" so that digits in directory names
# never leak into the output name.
zscore_path_parts <- strsplit(alz, "[\\|/]")[[1]]
zscore_source_name <- zscore_path_parts[length(zscore_path_parts)]
nfnzsc <- paste0("GSE", gsub("\\D", "", zscore_source_name), "zscore.txt")

# One row per gene, one column per subject.
zscraw <- scrawdat %>%
  t() %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(zscraw) <- subjnam
write.table(zscraw, file = nfnzsc, sep = "\t", col.names = TRUE, row.names = TRUE)

422

423

## Discretized the Data (one row per gene, one column per sample)
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t() %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

## setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1), stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

## NAs in a column: one extra row labelled "COL_NAs" holding, for every
## sample column, the number of missing values.
na_per_column <- colSums(is.na(dialzdat[, -1, drop = FALSE]))
nacol <- as.data.frame(t(c(0, unname(na_per_column))), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

## NAs in a row: one extra column holding the number of missing values per row
narowd <- as.data.frame(unname(rowSums(is.na(dialzdat))), stringsAsFactors = FALSE)
colnames(narowd) <- "ROW_NAs"
dialzdat <- bind_cols(dialzdat, narowd)
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

## converting to character so that the clinical can be brought together with discrete data
## The sample columns are 2 .. (last - 1); the first column is already text
## and the trailing ROW_NAs column stays numeric. Written with seq_len()
## because `2:n-1` parses as `(2:n) - 1`.
for (k in seq_len(dim(dialzdat)[2] - 1)[-1]) {
  dialzdat[[k]] <- as.character(dialzdat[[k]])
}

464

# The End the full data: clinical rows on top, discretized gene rows below.
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

# Produces Discrete file named GSE<digits of the series file name>dscrt.txt
discrete_path_parts <- strsplit(alz, "[\\|/]")[[1]]
discrete_source_name <- discrete_path_parts[length(discrete_path_parts)]
nfnaex2 <- paste0("GSE", gsub("\\D", "", discrete_source_name), "dscrt.txt")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t", col.names = TRUE, row.names = FALSE)

475

476

# Source: GitLab - Efrain Gonzalez / Cleaning and Fixing Data with R

# Updated: added |^GENE_SYMBOL$ to the gene-symbol column pattern