Efrain Gonzalez / Cleaning and Fixing Data with R

1

##Posted 6/15/2017

1

##Posted 6/15/2017

2

options(digits = 11)

2

options(digits = 11)

3

4

#Libraries required to run the code

4

#Libraries required to run the code

5

library(pryr)

5

library(pryr)

6

library(MASS)

6

library(MASS)

7

library(dplyr)

7

library(dplyr)

8

library(tidyr)

8

library(tidyr)

9

library(readr)

9

library(readr)

10

library(stringr)

10

library(stringr)

11

12

13

#Necessary Functions

13

#Necessary Functions

14

#1#Function for handling the changing of row names and column names

14

#1#Function for handling the changing of row names and column names

15

chngrownm <- function(mat){
  # Renames the columns of a transposed block of GEO "!Sample" rows.
  #
  # Args:
  #   mat: matrix-like object with column names already set. Row 1 holds the
  #        GEO field tag of each column, row 2 the first sample's value.
  # Returns:
  #   `mat` with new column names; the cell contents are not modified.
  #   Columns that match nothing keep their current name.
  n_col <- dim(mat)[2]

  # Tags that map to one fixed column name.
  fixed <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Free-text characteristics are recognised from the first sample's value.
  # They are tested in this order; when several patterns match the same
  # column the last match gives the name, but every match still uses up one
  # number of its own category.
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  # seq_len() keeps a zero-column input from entering the loop.
  for (j in seq_len(n_col)) {
    tag <- as.character(mat[1, j])
    # isTRUE() treats a missing tag as "not a fixed tag" instead of failing.
    if (isTRUE(tag %in% names(fixed))) {
      colnames(mat)[j] <- fixed[[tag]]
    } else {
      for (category in names(patterns)) {
        if (isTRUE(grepl(patterns[[category]], mat[2, j]))) {
          colnames(mat)[j] <- paste0(category, counters[[category]])
          counters[[category]] <- counters[[category]] + 1
        }
      }
    }
  }
  mat
}

58

59

#2#Function for reorganizing information within the columns

59

#2#Function for reorganizing information within the columns

60

cinfo <- function(mat){
  # Cleans the values of the clinical columns named by chngrownm().
  #
  # Args:
  #   mat: data frame (or matrix) whose column names contain "Group", "Age",
  #        "Sex", "PMI" or "Braak" for the columns to clean. Column 1 is
  #        never touched.
  # Returns:
  #   `mat` with the matching columns rewritten:
  #     Group - text up to the last ": " removed, as well as any
  #             " ...;..." tail
  #     Age   - every non-digit removed, then converted to integer
  #     Sex   - text up to the last ": " removed
  #     PMI   - everything except digits and "." removed, then numeric
  #     Braak - text up to the last ": " removed, then read as a roman
  #             numeral and converted to integer
  n_col <- dim(mat)[2]

  # seq_len(n)[-1] is 2..n, and empty when there is only one column
  # (2:n would count backwards and rewrite column 1).
  for (j in seq_len(n_col)[-1]) {
    if (grepl("Group", colnames(mat)[j])) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", colnames(mat)[j])) {
      # NOTE(review): a decimal point is removed too, so "85.5" becomes 855.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", colnames(mat)[j])) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", colnames(mat)[j])) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", colnames(mat)[j])) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

87

88

#3#Function for labeling the gene IDs without names

88

#3#Function for labeling the gene IDs without names

89

NAFIXING <- function(GIDNAM){
  # Gives every probe without a gene name its own ID as name.
  #
  # Args:
  #   GIDNAM: data frame or matrix; column 1 holds the gene/probe IDs and
  #           column 2 the gene names.
  # Returns:
  #   `GIDNAM` where each column-2 entry that is missing (NA) or is the
  #   text "NA" (optionally followed by whitespace) has been replaced by
  #   the column-1 entry of the same row.
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  gene_names <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Whole-column test instead of a row loop: it also works for zero rows and
  # avoids one data-frame cell access per probe.
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)
  gene_names[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- gene_names
  } else {
    GIDNAM[, 2] <- gene_names
  }
  GIDNAM
}

100

101

#4#Function for changing the gene ID to gene name

101

#4#Function for changing the gene ID to gene name

102

cgeneID <- function(GeneName,DATA){
  # Replaces the probe IDs of an expression table by gene names.
  #
  # Args:
  #   GeneName: two-column table; column 1 = probe ID, column 2 = gene name.
  #   DATA:     expression table whose first column holds the probe IDs.
  # Returns:
  #   t(DATA) with row 1 (the probe IDs) replaced by the gene name wherever
  #   the ID is listed in GeneName. IDs that are not listed are kept. When an
  #   ID is listed several times the first entry is used.
  nj <- t(GeneName)
  nq <- t(DATA)

  probe_ids <- nq[1, ]
  # Exact lookup. The IDs are plain text, so they must not be interpreted as
  # regular expressions ("." or "(" in an ID would mis-match or fail), and a
  # single match() replaces one full scan of the GPL table per probe.
  hit <- match(probe_ids, nj[1, ])
  found <- !is.na(probe_ids) & !is.na(hit)
  nq[1, found] <- nj[2, hit[found]]
  nq
}

120

#cgeneID <- function(GeneName,DATA){

120

#cgeneID <- function(GeneName,DATA){

121

# colGene <- dim(GeneName)[2]

121

# colGene <- dim(GeneName)[2]

122

# j <- 1

122

# j <- 1

123

# for(j in 1:colGene){

123

# for(j in 1:colGene){

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

125

# if(is.na(sum(chngsreq))==FALSE){

125

# if(is.na(sum(chngsreq))==FALSE){

126

# if(sum(chngsreq) > 0){

126

# if(sum(chngsreq) > 0){

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

128

# }

128

# }

129

# }

129

# }

130

# #if(sum(chngsreq) > 0){

130

# #if(sum(chngsreq) > 0){

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

133

# #}

133

# #}

134

# j = j+1

134

# j = j+1

135

# }

135

# }

136

# DATA

136

# DATA

137

#}

137

#}

138

139

#5#Function for adjusting the gene names

139

#5#Function for adjusting the gene names

140

gcnames <- function(DiData,usecol=1){
  # Reduces multi-gene column names to a single gene name.
  #
  # Args:
  #   DiData: matrix or data frame whose column names may list several genes
  #           separated by "///" or "//".
  #   usecol: which of the listed names to keep (default: the first).
  # Returns:
  #   Character vector with one trimmed name per column of DiData. A column
  #   that lists fewer than `usecol` names gets its first name.
  if (dim(DiData)[2] == 0) {
    return(character(0))
  }

  # Split every name once, then pick the wanted piece from each.
  pieces <- strsplit(colnames(DiData), "///|//")
  vapply(
    pieces,
    function(piece) {
      keep <- if (length(piece) >= usecol) usecol else 1
      trimws(piece[keep], whitespace = "[\\h\\v]")
    },
    character(1)
  )
}

155

156

#6# Function for discretizing the data

156

#6# Function for discretizing the data

157

dndat <- function(NDATA){
  # Discretizes numeric data into three levels.
  #
  # Args:
  #   NDATA: numeric matrix (the script passes z-scores).
  # Returns:
  #   Numeric matrix of the same dimensions and column names with
  #     0 where the value is below -1,
  #     1 where the value lies in [-1, 1],
  #     2 where the value is above 1,
  #   and missing values left as they are. Row names are not copied.
  values <- as.matrix(NDATA)
  DDATA <- matrix(NA_real_, nrow = dim(values)[1], ncol = dim(values)[2])
  colnames(DDATA) <- colnames(NDATA)

  # which() drops the NA comparisons, so missing cells are skipped here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values >= -1 & values <= 1)] <- 1
  DDATA[which(values > 1)] <- 2

  # Copy the missing cells over unchanged (keeps NA and NaN apart).
  missing <- which(is.na(values))
  DDATA[missing] <- values[missing]
  DDATA
}

184

185

186

#The Rest of this code will be used every time you want to change a data set

186

#The Rest of this code will be used every time you want to change a data set

187

188

#Getting the series matrix file

188

#Getting the series matrix file

189

# Ask for the two input files.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# A GPL file counts as a "soft" file when its file name (the last path
# component) contains "soft" or "annot".
gplparts <- strsplit(genena, "[\\|/]")[[1]]
soft <- grepl("soft|annot", gplparts[length(gplparts)])

202

203

#Working with the wordy part of the document

203

#Working with the wordy part of the document

204

# Read the "!Sample" lines of the series matrix file; "!Series" lines are
# treated as comments and the "!Sample_contact" lines are dropped.
samplerows <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(samplerows, grepl("!Sample", X1), !grepl("!Sample_contact", X1))

## Changing row names and column names:
# After transposing, each column is one "!Sample" field and row 1 holds the
# field tag. Columns get the placeholder names col1, col2, ... first.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)

# Rename the recognised columns, then drop the tag row.
ALZWORD <- chngrownm(ALZWORD)[-1, ]

# Columns still carrying a placeholder name were not recognised: remove them.
ALZWORD <- dplyr::select(
  as.data.frame(ALZWORD, stringsAsFactors = FALSE),
  -starts_with("col")
)

## Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

220

221

222

#Working with Actual Data part of file

222

#Working with Actual Data part of file

223

# Read the expression table: lines starting with "!" are comments, the first
# line of the file is skipped and the next uncommented line is the header.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# Transpose everything except the first (ID) column and discard the row
# names that the transposition produces.
ALZDAT <- t(alzdat[, -1])
dimnames(ALZDAT) <- list(NULL, colnames(ALZDAT))

227

228

##Is there a clean version of the GPL file available?

228

##Is there a clean version of the GPL file available?

229

## Is there a clean version of the GPL file available?
# The digits of the GPL file name identify the platform.
gplparts <- strsplit(genena, "[\\|/]")[[1]]
gplnum <- gsub("\\D", "", gplparts[length(gplparts)])
cleangpl <- paste0("Clean_GPL", gplnum, ".txt")

# Bookkeeping file: for each soft GPL file used so far, how many lines have
# to be skipped to reach its table header.
gplidloc <- "GPL_ID_LOC.txt"

# Names under which GPL files store the gene symbol column.
symcolpat <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$|^Gene symbol$|^GENE_SYMBOL$|^UCSC_RefGene_Name$"

# file.exists() checks for exactly the file that is read below; a substring
# search of the directory listing also matched e.g. Clean_GPL9600.txt when
# looking for GPL96.
if (file.exists(cleangpl)) {
  # use the clean version
  geneIDNam <- read_delim(cleangpl, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ## Lets create a clean version

  ## Gene ID to Gene Name
  if (soft == TRUE) {
    idLOCGPL <- NULL

    # Has this GPL soft file been used before?
    fileex <- file.exists(gplidloc)
    if (fileex) {
      gplloc <- read_delim(gplidloc, delim = "\t", col_names = TRUE)
      # Compare the whole number, not a substring of it.
      IDLOCAL <- which(gplloc$GPL_FILE_NUM == as.numeric(gplnum))
      if (length(IDLOCAL) > 0) {
        idLOCGPL <- gplloc$LOC_ID[IDLOCAL[1]]
      }
    }

    if (is.null(idLOCGPL)) {
      # No information on this particular GPL file yet: look for the "ID"
      # header cell within the first 1000 uncommented rows.
      gplhead <- read_delim(genena, delim = "\t", col_names = FALSE,
                            comment = "!", n_max = 1000)
      idhits <- grep("^ID\\s*$", t(gplhead))
      if (length(idhits) == 0) {
        stop("Could not find the ID column header in the first 1000 rows of ",
             genena, call. = FALSE)
      }
      idLOCGPL <- idhits[1] - 1

      # Remember the location for later runs.
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      if (fileex) {
        cat(Firstval, file = gplidloc, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(Firstval, file = gplidloc, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
      dplyr::select(., ID, grep(symcolpat, colnames(.)))
  } else {
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(symcolpat, colnames(.)))
  }

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace
  # Only the first two columns (ID and gene name) are kept.
  genemat <- as.matrix(geneIDNam)
  geneIDNam <- cbind(str_trim(genemat[, 1]), str_trim(genemat[, 2]))

  ## Here is the clean version
  write.table(geneIDNam, file = cleangpl, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

304

305

306

307

##Changing the gene ID to gene name

307

##Changing the gene ID to gene name

308

## Changing the gene ID to gene name
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

# Full RAW Data: clinical columns followed by the expression columns
Fullalzdwr <- cbind(ALZWORDF, as.data.frame(ALZDAT, stringsAsFactors = FALSE))

# Raw file is output
# The output name is built from the digits of the input file name. Both "\"
# and "/" are treated as path separators, so that digits in directory names
# do not end up in the output name.
alzparts <- strsplit(alz, "[\\|/]")[[1]]
gsenum <- gsub("\\D", "", alzparts[length(alzparts)])
nfnaex <- paste0("GSE", gsenum, "aftexcel.txt")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

330

331

332

#Now for the discretization part

332

#Now for the discretization part

333

##get the wordy part again

333

##get the wordy part again

334

## get the wordy part again
rawword <- t(ALZWORDF)

## where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))
if (length(hereim) == 0) {
  # Without this check the negative indexing below would select nothing.
  stop("The sample information has no ID_REF column", call. = FALSE)
}

## Subject Names GSM...
subjnam <- rawword[hereim, ]

## Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(., stringsAsFactors = FALSE)

# drop = FALSE keeps the matrix shape when only one clinical row is left.
RAWWORD <- rawword[-hereim, , drop = FALSE] %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  bind_cols(namedarows, .)

# Per row: number of missing cells plus number of cells whose text
# contains "NA".
rowna <- vapply(
  seq_len(dim(RAWWORD)[1]),
  function(z) {
    as.numeric(sum(is.na(RAWWORD[z, ])) + length(grep("NA", RAWWORD[z, ])))
  },
  numeric(1)
)
naroww <- data.frame(ROW_NAs = rowna)
RAWWORD <- bind_cols(RAWWORD, naroww)

362

363

364

# Gene names (the column names of ALZDAT) as a one-column data frame.
roALZna <- as.data.frame(colnames(ALZDAT), stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

# Expression values with one row per gene, stripped of all names.
RAWDAT <- as.data.frame(t(ALZDAT), stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

# Put the gene names in front and sort the rows by gene name.
RAWDAT2 <- dplyr::arrange(cbind(roALZna, RAWDAT), ID_REF)

## Editing the file for R processing
# Sorted gene names as a one-column matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Values converted to numeric column by column, then transposed.
RAWDATNUM <- t(mapply(RAWDAT2[, -1], FUN = as.numeric))

385

386

##Consolidating genes with the same name

386

##Consolidating genes with the same name

387

###create empty matrix of size equal to tabRDATID

387

###create empty matrix of size equal to tabRDATID

388

# One output column per distinct gene name.
tabRDATID <- table(RAWDATID)
genelabs <- rownames(tabRDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

for (j in seq_along(genelabs)) {
  samecols <- which(RAWDATID == genelabs[j])
  if (length(samecols) == 1) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, samecols]
  } else if (length(samecols) > 1) {
    ## Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row;
    # rowMeans() fails on a plain vector.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, samecols, drop = FALSE], na.rm = TRUE)
  }
}

402

403

##Outputting non Z-score Average over genes

403

##Outputting non Z-score Average over genes

404

## Outputting non Z-score Average over genes
newoutput <- NuRDATN
colnames(newoutput) <- rownames(tabRDATID)

# The output name is built from the digits of the input file name. Both "\"
# and "/" are treated as path separators, so that digits in directory names
# do not end up in the output name.
alzparts <- strsplit(alz, "[\\|/]")[[1]]
gsenum <- gsub("\\D", "", alzparts[length(alzparts)])
nfnewout <- paste0("GSE", gsenum, "avg.txt")

# One row per gene: gene name first, then one column per subject.
noutput <- as.data.frame(t(newoutput), stringsAsFactors = FALSE)
noutput <- cbind(rownames(noutput), noutput)
colnames(noutput) <- c("Gene Symbol", subjnam)
rownames(noutput) <- NULL
write.table(noutput, file = nfnewout, sep = "\t", col.names = TRUE, row.names = FALSE)

419

420

421

##Scaling the Data

421

##Scaling the Data

422

## Scaling the Data
# scale() centers and scales every column (gene); its bookkeeping attributes
# are removed so that a plain matrix remains.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

# Outputting the Z-score file
# The output name is built from the digits of the input file name. Both "\"
# and "/" are treated as path separators, so that digits in directory names
# do not end up in the output name.
alzparts <- strsplit(alz, "[\\|/]")[[1]]
gsenum <- gsub("\\D", "", alzparts[length(alzparts)])
nfnzsc <- paste0("GSE", gsenum, "zscore.txt")

# One row per gene: gene name first, then one column per subject.
zscraw <- as.data.frame(t(scrawdat), stringsAsFactors = FALSE)
zscraw <- cbind(rownames(zscraw), zscraw)
colnames(zscraw) <- c("Gene Symbol", subjnam)
rownames(zscraw) <- NULL
write.table(zscraw, file = nfnzsc, sep = "\t", col.names = TRUE, row.names = FALSE)

442

443

444

##Discretized the Data

444

##Discretized the Data

445

## Discretized the Data
# One row per gene, one column per sample.
dialzdat <- as.data.frame(t(dndat(scrawdat)), stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

## setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1), stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

## NAs in a column
# Extra row "COL_NAs" holding the number of missing values of every sample
# column. seq_len(n)[-1] is 2..n and empty when there is no sample column.
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
for (x in seq_len(dim(dialzdat)[2])[-1]) {
  # [[x]] extracts the column as a vector for data frames and tibbles alike.
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[[x]])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

## NAs in a row
# Extra column "ROW_NAs" holding the number of missing values of every row
# (the COL_NAs row included).
narowd <- as.data.frame(rep(0, dim(dialzdat)[1]), stringsAsFactors = FALSE)
for (y in seq_len(dim(dialzdat)[1])) {
  narowd[y, 1] <- as.integer(sum(is.na(dialzdat[y, ])))
}
colnames(narowd) <- "ROW_NAs"
dialzdat <- bind_cols(dialzdat, narowd)

477

# Name the sample columns (everything between ID_REF and ROW_NAs) after the
# subjects and give the clinical rows the same column names.
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

## converting to character so that the clinical can be brought together with discrete data
# The parentheses matter: 2:n - 1 means (2:n) - 1, i.e. 1:(n - 1).
for (k in 2:(dim(dialzdat)[2] - 1)) {
  dialzdat[[k]] <- as.character(dialzdat[[k]])
}

# The End the full data: clinical rows on top, discretized genes below
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

# Produces Discrete file
# The output name is built from the digits of the input file name.
alzparts <- strsplit(alz, "[\\|/]")[[1]]
gsenum <- gsub("\\D", "", alzparts[length(alzparts)])
nfnaex2 <- paste0("GSE", gsenum, "dscrt.txt")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t", col.names = TRUE, row.names = FALSE)

496

497

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Added |^UCSC_RefGene_Name$ to list of potential names in GPL