Efrain Gonzalez / Cleaning and Fixing Data with R

1

##Posted 6/15/2017

1

##Posted 6/15/2017

2

options(digits = 11)

2

options(digits = 11)

3

4

#Libraries required to run the code

4

#Libraries required to run the code

5

library(pryr)

5

library(pryr)

6

library(MASS)

6

library(MASS)

7

library(dplyr)

7

library(dplyr)

8

library(tidyr)

8

library(tidyr)

9

library(readr)

9

library(readr)

10

library(stringr)

10

library(stringr)

11

12

13

#Necessary Functions

13

#Necessary Functions

14

#1#Function for handling the changing of row names and column names

14

#1#Function for handling the changing of row names and column names

15

#' Rename the columns of a transposed sample-metadata matrix.
#'
#' Row 1 of `mat` is compared against the GEO field labels
#' "!Sample_source_name_ch1", "!Sample_title" and "!Sample_geo_accession";
#' a column whose label matches is renamed "Brain_Region", "Title" or
#' "ID_REF". For every other column, row 2 is searched for sex, PMI, age,
#' Braak and group keywords and the column is renamed "Sex<n>", "PMI<n>",
#' "Age<n>", "Braak<n>" or "Group<n>".
#'
#' The keyword checks are independent: when several patterns match the same
#' column, the last matching one decides the final name, and every matching
#' family still consumes one number from its counter.
#'
#' @param mat A matrix with column names set and at least two rows.
#' @return `mat` with updated column names; cell contents are untouched.
chngrownm <- function(mat) {
  # One running suffix per keyword family.
  sex_n <- 1
  pmi_n <- 1
  age_n <- 1
  braak_n <- 1
  group_n <- 1

  # seq_len() yields an empty sequence for a zero-column input, where
  # 1:0 would have visited the non-existent columns 1 and 0.
  for (j in seq_len(ncol(mat))) {
    label <- mat[1, j]

    # isTRUE() treats a missing label as "no match" instead of letting
    # `if (NA)` abort the whole run.
    if (isTRUE(label == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    } else if (isTRUE(label == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    } else if (isTRUE(label == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      first_value <- mat[2, j]

      if (grepl("Sex|gender|Gender|sex", first_value)) {
        colnames(mat)[j] <- paste0("Sex", sex_n)
        sex_n <- sex_n + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", first_value)) {
        colnames(mat)[j] <- paste0("PMI", pmi_n)
        pmi_n <- pmi_n + 1
      }
      if (grepl("age|Age|AGE", first_value)) {
        colnames(mat)[j] <- paste0("Age", age_n)
        age_n <- age_n + 1
      }
      if (grepl("braak|b&b", first_value)) {
        colnames(mat)[j] <- paste0("Braak", braak_n)
        braak_n <- braak_n + 1
      }
      if (grepl(
        "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",
        first_value
      )) {
        colnames(mat)[j] <- paste0("Group", group_n)
        group_n <- group_n + 1
      }
    }
  }
  mat
}

58

59

#2#Function for reorganizing information within the columns

59

#2#Function for reorganizing information within the columns

60

#' Clean the values of the renamed metadata columns.
#'
#' Columns are selected by name, starting from the second column:
#' * "Group": strips everything up to the last ": " and any trailing
#'   " ...;..." remainder.
#' * "Age": keeps digits only and converts to integer.
#' * "Sex": strips everything up to the last ": ".
#' * "PMI": keeps digits and dots and converts to numeric.
#' * "Braak": strips the "...: " prefix, reads the rest as a Roman numeral
#'   and converts it to integer.
#'
#' @param mat A data frame (or matrix) with column names.
#' @return `mat` with the matching columns rewritten.
cinfo <- function(mat) {
  col_total <- dim(mat)[2]

  # The scan starts at column 2. With fewer than two columns there is
  # nothing to do; `2:col_total` would otherwise count down (2, 1) and
  # touch column 1 as well as a non-existent column 2.
  if (col_total < 2) {
    return(mat)
  }

  for (j in 2:col_total) {
    field <- colnames(mat)[j]

    if (grepl("Group", field)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", field)) {
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", field)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", field)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", field)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

87

88

#3#Function for labeling the gene IDs without names

88

#3#Function for labeling the gene IDs without names

89

#' Give unnamed probes their own ID as a name.
#'
#' Wherever the second column of `GIDNAM` is missing, or is the text "NA"
#' optionally followed by whitespace, it is replaced by the value in the
#' first column of the same row.
#'
#' @param GIDNAM A two-dimensional table whose first column holds the ID
#'   and whose second column holds the name.
#' @return `GIDNAM` with the gaps in column 2 filled from column 1.
NAFIXING <- function(GIDNAM) {
  symbols <- GIDNAM[, 2, drop = TRUE]

  # Whole-column test instead of a per-row loop; an input with zero rows
  # simply produces no positions to fix.
  unnamed <- which(is.na(symbols) | grepl("^NA\\s*$", symbols))

  if (length(unnamed) > 0) {
    GIDNAM[unnamed, 2] <- GIDNAM[unnamed, 1, drop = TRUE]
  }
  GIDNAM
}

100

101

#4#Function for changing the gene ID to gene name

101

#4#Function for changing the gene ID to gene name

102

#' Replace probe IDs in the expression table by their gene names.
#'
#' Both inputs are transposed. Row 1 of the transposed `DATA` holds the
#' probe IDs; rows 1 and 2 of the transposed `GeneName` hold the IDs and
#' the names they map to. Every probe ID that occurs in the lookup is
#' overwritten with the corresponding name; IDs without an entry are kept.
#'
#' @param GeneName Two-column table: ID in column 1, name in column 2.
#' @param DATA Table whose first column holds the probe IDs.
#' @return The transposed `DATA` with row 1 translated.
cgeneID <- function(GeneName, DATA) {
  lookup <- t(GeneName)
  nq <- t(DATA)

  probe_ids <- nq[1, ]

  # Exact matching: IDs are compared as literal text, so characters such
  # as "." or "+" are not interpreted as regular-expression syntax, and
  # the first lookup entry wins when an ID is listed more than once.
  hit <- match(probe_ids, lookup[1, ])
  found <- !is.na(hit) & !is.na(probe_ids)

  nq[1, found] <- lookup[2, hit[found]]
  nq
}

120

#cgeneID <- function(GeneName,DATA){

120

#cgeneID <- function(GeneName,DATA){

121

# colGene <- dim(GeneName)[2]

121

# colGene <- dim(GeneName)[2]

122

# j <- 1

122

# j <- 1

123

# for(j in 1:colGene){

123

# for(j in 1:colGene){

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

124

# chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

125

# if(is.na(sum(chngsreq))==FALSE){

125

# if(is.na(sum(chngsreq))==FALSE){

126

# if(sum(chngsreq) > 0){

126

# if(sum(chngsreq) > 0){

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

127

# DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

128

# }

128

# }

129

# }

129

# }

130

# #if(sum(chngsreq) > 0){

130

# #if(sum(chngsreq) > 0){

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

131

# ##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

132

# #DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

133

# #}

133

# #}

134

# j = j+1

134

# j = j+1

135

# }

135

# }

136

# DATA

136

# DATA

137

#}

137

#}

138

139

#5#Function for adjusting the gene names

139

#5#Function for adjusting the gene names

140

#' Pick one gene name out of multi-name column labels.
#'
#' Each column name of `DiData` is split on "///" or "//". The piece at
#' position `usecol` is returned when the name has that many pieces,
#' otherwise the first piece. Surrounding whitespace is trimmed.
#'
#' @param DiData A matrix or data frame with column names.
#' @param usecol Position of the piece to keep (default 1).
#' @return A character vector with one entry per column of `DiData`.
gcnames <- function(DiData, usecol = 1) {
  nuruns <- dim(DiData)[2]

  # Nothing to rename; 1:0 would have produced a spurious NA entry.
  if (nuruns == 0) {
    return(character(0))
  }

  # Split every name once instead of re-splitting it for each test.
  pieces <- strsplit(colnames(DiData), "///|//")

  chosen <- vapply(
    pieces,
    function(p) if (length(p) >= usecol) p[usecol] else p[1],
    character(1)
  )
  str_trim(chosen)
}

155

156

#6# Function for discretizing the data

156

#6# Function for discretizing the data

157

#' Discretize a numeric table into three levels.
#'
#' Values below -1 become 0, values above 1 become 2, and values from -1
#' to 1 (inclusive) become 1. Missing values are carried over unchanged.
#'
#' @param NDATA A numeric matrix (or numeric data frame).
#' @return A numeric matrix of the same dimensions carrying the column
#'   names of `NDATA`.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start from the middle level; only the two tails and the missing
  # cells need to be overwritten.
  DDATA <- matrix(1, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
  colnames(DDATA) <- colnames(NDATA)

  # which() drops NA comparisons, so missing cells are left for the
  # final step.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  gaps <- which(is.na(values))
  DDATA[gaps] <- values[gaps]

  DDATA
}

184

185

186

#The Rest of this code will be used every time you want to change a data set

186

#The Rest of this code will be used every time you want to change a data set

187

188

#Getting the series matrix file

188

#Getting the series matrix file

189

# Ask for the series matrix file.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Ask for the GPL file that belongs to it.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# The GPL file is treated as a soft file when its file name (the last
# path component) contains "soft" or "annot".
soft <- grepl("soft|annot", tail(strsplit(genena, "[\\|/]")[[1]], 1))

# Sample metadata: read the file without the "!Series" content and keep
# only the "!Sample" rows, minus the "!Sample_contact" ones.
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(alzword, grepl("!Sample", X1))
alzword <- filter(alzword, !grepl("!Sample_contact", X1))

## Transpose so each field is a column, give the columns placeholder
## names, rename the recognised ones, and drop the label row.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
ALZWORD <- chngrownm(ALZWORD)[-1, ]

## Columns that still carry a placeholder "col..." name are discarded.
ALZWORD <- as.data.frame(ALZWORD, stringsAsFactors = FALSE)
ALZWORD <- dplyr::select(ALZWORD, -starts_with("col"))

## Clean the values inside the remaining columns.
ALZWORDF <- cinfo(ALZWORD)

220

221

222

#Working with Actual Data part of file

222

#Working with Actual Data part of file

223

# Expression table: comment lines starting with "!" are ignored and the
# first line is skipped. The first column (probe IDs) is dropped before
# transposing, so ALZDAT has one row per sample.
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

## Platform number: every digit in the GPL file name.
gplnum <- gsub("\\D", "", tail(strsplit(genena, "[\\|/]")[[1]], 1))

## Is there a clean version of the GPL file available?
## The exact file is checked: a prefix search over list.files() would also
## accept e.g. "Clean_GPL9600.txt" when looking for platform 96.
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")

if (file.exists(clean_gpl_file)) {
  # Use the clean version
  geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                          col_names = c("ID", "Symbol"), comment = "!")
} else {
  ## Let's create a clean version
  symbol_cols <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$"

  if (soft) {
    # GPL_ID_LOC.txt remembers, per platform number, how many lines to
    # skip before the ID table starts.
    loc_file <- "GPL_ID_LOC.txt"
    idLOCGPL <- NULL

    if (file.exists(loc_file)) {
      known <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      # Compare whole numbers; a substring search would let "96" match
      # "5096" and pick up the wrong row (or several rows).
      same_gpl <- which(known$GPL_FILE_NUM == as.integer(gplnum))
      if (length(same_gpl) > 0) {
        idLOCGPL <- known$LOC_ID[same_gpl[1]]
      }
    }

    if (is.null(idLOCGPL)) {
      # No record for this platform yet: look for the "ID" cell in the
      # first 1000 rows and remember the position just before it.
      gpl_head <- read_delim(genena, delim = "\t", col_names = FALSE,
                             comment = "!", n_max = 1000)
      idLOCGPL <- grep("^ID\\s*$", t(gpl_head)) - 1
      new_entry <- cbind(as.integer(gplnum), as.integer(idLOCGPL))

      if (file.exists(loc_file)) {
        cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
      } else {
        # First entry ever: create the file together with its header.
        colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
        write.table(new_entry, file = loc_file, sep = "\t",
                    row.names = FALSE, col.names = TRUE)
      }
    }

    gpl_table <- read_delim(genena, delim = "\t", col_names = TRUE,
                            comment = "!", skip = idLOCGPL)
  } else {
    gpl_table <- read_delim(genena, delim = "\t", comment = "#")
  }

  ## Gene ID to Gene Name: keep the ID column plus the name column(s)
  geneIDNam <- dplyr::select(gpl_table, ID, grep(symbol_cols, colnames(gpl_table)))

  ## Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ## remove the whitespace
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))

  ## Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

304

305

306

307

##Changing the gene ID to gene name

307

##Changing the gene ID to gene name

308

# Translate the probe IDs and use the result as column names of ALZDAT.
ALZDAT1 <- cgeneID(geneIDNam, alzdat)
colnames(ALZDAT) <- ALZDAT1[1, ]

## Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

# Full RAW data: sample metadata columns followed by the expression columns
Fullalzdwr <- cbind(ALZWORDF, as.data.frame(ALZDAT))

# Raw file is output as "GSE<digits of the input file name>aftexcel.txt".
# The path is split on "\", "|" and "/" (the same pattern used for the other
# file names in this script) so that only the file name contributes digits;
# splitting on "\" alone kept the whole path when it used "/" separators.
nfnaex <- paste(
  c("GSE", gsub("\\D", "", tail(strsplit(alz, "[\\|/]")[[1]], 1)), "aftexcel.txt"),
  collapse = ""
)
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

330

331

332

#Now for the discretization part

332

#Now for the discretization part

333

##get the wordy part again

333

##get the wordy part again

334

## get the wordy part again, one column per sample
rawword <- t(ALZWORDF)

## where is ID_REF located
hereim <- grep("ID_REF", rownames(rawword))

## Subject Names GSM...
subjnam <- rawword[hereim, ]

## Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame(., stringsAsFactors = FALSE)
RAWWORD <- rawword[-hereim, ] %>%
  as.data.frame(., stringsAsFactors = FALSE) %>%
  bind_cols(namedarows, .)

## Per metadata row: number of missing cells plus number of cells in which
## grep() finds "NA"
naroww <- as.data.frame(rep(0, dim(RAWWORD)[1]), stringsAsFactors = FALSE)
for (z in seq_len(dim(RAWWORD)[1])) {
  if (sum(is.na(RAWWORD[z, ])) > 0) {
    naroww[z, 1] <- as.integer(sum(is.na(RAWWORD[z, ])))
  }
  if (length(grep("NA", RAWWORD[z, ])) > 0) {
    naroww[z, 1] <- as.integer(length(grep("NA", RAWWORD[z, ]))) + naroww[z, 1]
  }
}
colnames(naroww) <- "ROW_NAs"
RAWWORD <- bind_cols(RAWWORD, naroww)

## Gene names as their own ID_REF column
roALZna <- t(ALZDAT) %>%
  rownames(.) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(roALZna) <- "ID_REF"

RAWDAT <- t(ALZDAT) %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

## Sorted by gene name so duplicates sit next to each other
RAWDAT2 <- RAWDAT %>%
  cbind(roALZna, .) %>%
  dplyr::arrange(., ID_REF)

## Editing the file for R processing
RAWDATID <- RAWDAT2[, 1] %>%
  as.matrix(.)

RAWDATNUM <- RAWDAT2[, -1] %>%
  mapply(., FUN = as.numeric) %>%
  t(.)

## Consolidating genes with the same name
### one output column per distinct gene name
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  same_gene <- which(RAWDATID == rownames(tabRDATID)[j])

  if (tabRDATID[j] == 1) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, same_gene]
  } else if (tabRDATID[j] > 1) {
    ## Averaging duplicates and putting them in their new homes.
    ## drop = FALSE keeps a matrix even when there is only one sample row,
    ## which rowMeans() requires.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_gene, drop = FALSE], na.rm = TRUE)
  }
}

## Scaling the Data (column-wise), then dropping the scale() attributes
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

## Discretized the Data; transposed back to one row per gene
dialzdat <- scrawdat %>%
  dndat(.) %>%
  t() %>%
  as.data.frame(., stringsAsFactors = FALSE)
colnames(dialzdat) <- rownames(RAWDATNUM)

## setting "ID_REF" as a new variable
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat), ncol = 1),
                         stringsAsFactors = FALSE)
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM, dialzdat)

## NAs in a column: appended as a final "COL_NAs" row
nacol <- as.data.frame(t(rep(0, dim(dialzdat)[2])), stringsAsFactors = FALSE)
nacol[1, 1] <- "COL_NAs"
for (x in 2:dim(dialzdat)[2]) {
  nacol[1, x] <- as.integer(sum(is.na(dialzdat[, x])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

## NAs in a row: appended as a final "ROW_NAs" column
narowd <- as.data.frame(rep(0, dim(dialzdat)[1]), stringsAsFactors = FALSE)
for (y in seq_len(dim(dialzdat)[1])) {
  narowd[y, 1] <- as.integer(sum(is.na(dialzdat[y, ])))
}
colnames(narowd) <- "ROW_NAs"
dialzdat <- bind_cols(dialzdat, narowd)

## Sample columns get the subject names; the metadata block gets the same header
colnames(dialzdat)[2:(dim(dialzdat)[2] - 1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)

## converting to character so that the clinical can be brought together with discrete data
## Parenthesised upper bound: `2:n-1` is `(2:n) - 1`, i.e. columns 1 to n-1.
for (k in 2:(dim(dialzdat)[2] - 1)) {
  dialzdat[, k] <- as.character(dialzdat[, k])
}

# The End the full data
Dscrtalzdw <- bind_rows(RAWWORD, dialzdat)

# Produces Discrete file named "GSE<digits of the input file name>dscrt.txt"
nfnaex2 <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "dscrt.txt") %>%
  paste(collapse = "")
write.table(Dscrtalzdw, file = nfnaex2, sep = "\t", col.names = TRUE, row.names = FALSE)

462

463

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Updating handling of strings in as.data.frame function