Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(pryr)

2

library(pryr)

3

library(MASS)

3

library(MASS)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names

11

#1#Function for handling the changing of row names and column names

12

# Rename the columns of a transposed GEO series-matrix sample header.
#
# Args:
#   mat: matrix (or data frame) whose first row holds the "!Sample_*" field
#        names and whose second row holds the first sample's values.
#
# Returns:
#   `mat` with renamed columns. Columns whose field name is one of the three
#   known headers become "Brain_Region", "Title" or "ID_REF". Any other column
#   is labelled from the text of its second row as Group<n>, Braak<n>, Age<n>,
#   PMI<n> or Sex<n>, where <n> counts columns per label starting at 1.
#   Columns matching nothing keep their existing name.
chngrownm <- function(mat) {
  if (nrow(mat) < 1 || ncol(mat) < 1) {
    return(mat)
  }
  # `colnames(mat)[j] <- ...` needs an existing names vector to index into.
  if (is.null(colnames(mat))) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  header_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )

  # Highest priority first. When a value matches several patterns (for
  # example "braak stage" also contains "age") only the first listed label is
  # applied, so a single column never consumes more than one counter.
  value_rules <- list(
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",
    Braak = "braak|b&b",
    Age = "age|Age|AGE",
    PMI = "postmorteminterval|PMI|pmi",
    Sex = "Sex|gender|Gender|sex"
  )
  counters <- setNames(rep(1L, length(value_rules)), names(value_rules))
  has_values <- nrow(mat) >= 2

  for (j in seq_len(ncol(mat))) {
    field <- as.character(mat[1, j])

    # A recognised header decides the name on its own; the sample text of
    # such a column must not be allowed to relabel it afterwards.
    if (!is.na(field) && field %in% names(header_names)) {
      colnames(mat)[j] <- header_names[[field]]
      next
    }
    if (!has_values) {
      next
    }

    value <- as.character(mat[2, j])
    for (label in names(value_rules)) {
      # grepl() returns FALSE for NA input, so missing values match nothing.
      if (grepl(value_rules[[label]], value)) {
        colnames(mat)[j] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1L
        break
      }
    }
  }
  mat
}

57

58

#2#Function for reorganizing information within the columns

58

#2#Function for reorganizing information within the columns

59

# Clean the values of the labelled sample-characteristic columns.
#
# Args:
#   mat: data frame (or matrix) whose column names carry the labels assigned
#        by chngrownm(). The first column is never modified.
#
# Returns:
#   `mat` where, from the second column on:
#     * "Group" columns lose their "label: " prefix and any "; ..." tail,
#     * "Age" columns become the integer part of the first number found,
#     * "Sex" columns lose their "label: " prefix,
#     * "PMI" columns become the first number found (numeric),
#     * "Braak" columns are read as roman (or arabic) numerals -> integer.
cinfo <- function(mat) {
  # First numeric token after an optional "label:" prefix, as text; NA if
  # there is none. Taking one token (instead of deleting every non-digit)
  # keeps "85.5" from turning into 855 and "12.5 hrs." from turning into the
  # unparseable "12.5.".
  first_number <- function(x) {
    x <- sub("^[^:]*:\\s*", "", as.character(x))
    pos <- regexpr("[0-9]*\\.?[0-9]+", x)
    len <- attr(pos, "match.length")
    found <- !is.na(pos) & pos > 0
    out <- rep(NA_character_, length(x))
    out[found] <- substring(x[found], pos[found], pos[found] + len[found] - 1L)
    out
  }

  # seq_len(...)[-1] is empty for a one-column input, unlike 2:1.
  for (j in seq_len(ncol(mat))[-1]) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      mat[, j] <- as.integer(first_number(mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(first_number(mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      stage <- gsub(".+:\\s", "", mat[, j])
      mat[, j] <- as.integer(utils::as.roman(stage))
    }
  }
  mat
}

86

87

#3#Function for labeling the gene IDs without names

87

#3#Function for labeling the gene IDs without names

88

# Give every gene ID without a usable name its own ID as the name.
#
# Args:
#   GIDNAM: data frame / tibble / matrix with IDs in column 1 and gene names
#           in column 2.
#
# Returns:
#   `GIDNAM` with column 2 as character; entries that were NA or the text
#   "NA" (optionally followed by whitespace) are replaced by the ID from
#   column 1 of the same row.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
  symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

  # Work on character vectors so that a numeric ID or a factor symbol column
  # cannot make the replacement fail or produce NA.
  ids <- as.character(ids)
  symbols <- as.character(symbols)

  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
  symbols[unnamed] <- ids[unnamed]

  if (is_df) {
    GIDNAM[[2]] <- symbols
  } else {
    GIDNAM[, 2] <- symbols
  }
  GIDNAM
}

99

100

#4#Function for changing the gene ID to gene name

100

#4#Function for changing the gene ID to gene name

101

# Replace gene IDs in the first row of DATA by their gene names.
#
# Args:
#   GeneName: two-row matrix; row 1 holds gene IDs, row 2 the matching names.
#   DATA:     matrix whose first row holds gene IDs.
#
# Returns:
#   `DATA` with every first-row entry that is exactly equal to an ID in
#   GeneName[1, ] replaced by the corresponding name. Other entries and all
#   other rows are left untouched.
cgeneID <- function(GeneName, DATA) {
  if (NCOL(GeneName) == 0 || NCOL(DATA) == 0) {
    return(DATA)
  }
  ids <- as.character(GeneName[1, ])
  current <- as.character(DATA[1, ])

  # match() compares whole strings, so IDs containing regex metacharacters
  # (".", "+", "(" ...) cannot match unrelated entries, and the lookup is a
  # single hashed pass instead of one grep per gene. With duplicated IDs the
  # first occurrence in GeneName wins.
  idx <- match(current, ids)
  hit <- !is.na(idx) & !is.na(current)
  DATA[1, hit] <- GeneName[2, idx[hit]]
  DATA
}

115

116

#5#Function for adjusting the gene names

116

#5#Function for adjusting the gene names

117

# Pick one gene name out of each "///"-separated column name.
#
# Args:
#   DiData: matrix or data frame with column names.
#   usecol: position of the name to keep within each "///"-separated list.
#           When a column name has fewer pieces than `usecol`, its first
#           piece is used instead.
#
# Returns:
#   Character vector with one whitespace-trimmed name per column of DiData.
gcnames <- function(DiData, usecol = 1) {
  col_names <- colnames(DiData)
  if (is.null(col_names)) {
    if (NCOL(DiData) > 0) {
      stop("DiData must have column names", call. = FALSE)
    }
    return(character(0))
  }

  # Split every name once; "///" is a literal separator, not a pattern.
  pieces <- strsplit(col_names, "///", fixed = TRUE)
  vapply(
    pieces,
    function(p) trimws(if (length(p) >= usecol) p[usecol] else p[1]),
    character(1),
    USE.NAMES = FALSE
  )
}

132

133

134

135

#The Rest of this code will be used every time you want to change a data set

135

#The Rest of this code will be used every time you want to change a data set

136

137

#Getting the series matrix file

137

#Getting the series matrix file

138

# Ask once for each input file. file.choose() opens an interactive picker
# and returns the chosen path.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

144

145

146

#Set working directory based on the directory of the series matrix file Currently only works for windows

146

#Set working directory based on the directory of the series matrix file Currently only works for windows

147

##strsplit(alz,"[\\]") %>%

147

##strsplit(alz,"[\\]") %>%

148

## .[[1]] %>%

148

## .[[1]] %>%

149

## .[-length(.)] %>%

149

## .[-length(.)] %>%

150

## paste(.,collapse="/") %>%

150

## paste(.,collapse="/") %>%

151

## setwd()

151

## setwd()

152

153

#Find out if it is a soft GPL file or not

153

#Find out if it is a soft GPL file or not

154

# TRUE when the GPL file name (directories stripped) contains "soft" or
# "annot"; later code uses this to choose how the GPL table is read.
soft <- grepl("soft|annot", basename(genena))

158

159

#Working with the wordy part of the document

159

#Working with the wordy part of the document

160

# Keep only the "!Sample..." header rows of the series matrix file, without
# the "!Sample_contact..." rows. Lines starting with "!Series" are treated as
# comments by the reader.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  dplyr::filter(grepl("!Sample", X1)) %>%
  dplyr::filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
# After transposing, row 1 holds the field names and each further row is one
# sample. Placeholder names col1, col2, ... are assigned so that columns left
# unlabelled by chngrownm() can be dropped by their "col" prefix.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a matrix even when the file describes a single sample.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
ALZWORD <- ALZWORD %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

176

177

178

#Working with Actual Data part of file

178

#Working with Actual Data part of file

179

# Read the data table of the series matrix file: lines starting with "!" are
# comments, the first line is skipped and the next remaining line is the
# header.
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)

# Transpose everything except the first column, so rows of ALZDAT correspond
# to the remaining columns of the file.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

183

184

##Is there a clean version of the GPL file available?

184

##Is there a clean version of the GPL file available?

185

# Digits of the GPL file name (directories stripped).
gplnum <- gsub("\\D", "", basename(genena))

# The cleaned ID/name table for this platform is cached in the working
# directory. Test for the exact file: a substring search over list.files()
# would also accept e.g. Clean_GPL570.txt while looking for GPL57.
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
clfileex <- as.integer(file.exists(clean_gpl_file))

if (clfileex >= 1) {
  #use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version

  # Read a GPL table and keep the ID column plus every column whose name
  # contains "Symbol" or is "ORF" / "gene_assignment".
  read_gpl_symbols <- function(path, comment, skip = 0) {
    path %>%
      read_delim(delim = "\t", col_names = TRUE, comment = comment, skip = skip) %>%
      dplyr::select(., ID, grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$", colnames(.)))
  }

  # Number of rows to skip so that the row whose first field is "ID" becomes
  # the header. Only the first 1000 rows are inspected.
  find_id_offset <- function(path) {
    head_rows <- read_delim(path, delim = "\t", col_names = FALSE,
                            comment = "!", n_max = 1000)
    first_column <- t(head_rows)[1, ]
    id_row <- grep("^ID\\s*$", first_column)
    if (length(id_row) == 0) {
      stop("No 'ID' header row found in the first 1000 rows of ", path,
           call. = FALSE)
    }
    id_row[1] - 1
  }

  ##Gene ID to Gene Name
  if (soft == TRUE) {
    # GPL_ID_LOC.txt remembers, per GPL number, how many rows precede the
    # table header, so the search is done once per platform.
    loc_file <- "GPL_ID_LOC.txt"
    fileex <- file.exists(loc_file)
    idLOCGPL <- NULL

    if (fileex) {
      #Check to see if this GPL soft file has been used before
      known <- read_delim(loc_file, delim = "\t", col_names = TRUE)
      # Compare numbers, not substrings: "96" must not match 960.
      IDLOCAL <- which(as.numeric(known$GPL_FILE_NUM) == as.numeric(gplnum))
      if (length(IDLOCAL) > 0) {
        idLOCGPL <- known$LOC_ID[IDLOCAL[1]]
      }
    }

    if (is.null(idLOCGPL)) {
      #No information on this particular GPL file: find it and record it
      idLOCGPL <- find_id_offset(genena)
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      # A new file gets the header row; an existing one gets the entry
      # appended without a header.
      write.table(Firstval, file = loc_file, sep = "\t", row.names = FALSE,
                  col.names = !fileex, append = fileex)
    }

    geneIDNam <- read_gpl_symbols(genena, comment = "!", skip = idLOCGPL)
  } else {
    geneIDNam <- read_gpl_symbols(genena, comment = "#")
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  gene_t <- t(geneIDNam)
  geneIDNam <- t(rbind(str_trim(gene_t[1, ]), str_trim(gene_t[2, ])))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

274

275

276

277

##Changing the gene ID to gene name

277

##Changing the gene ID to gene name

278

# Translate the gene IDs (first row of the transposed data table) to gene
# names and use them as the column names of the expression matrix.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

#Full Data
# cbind() on data frames can recycle the shorter input when the row counts
# differ by a whole multiple; fail loudly instead of pairing samples wrongly.
if (nrow(ALZWORDF) != nrow(ALZDAT)) {
  stop("Sample information has ", nrow(ALZWORDF), " rows but the data has ",
       nrow(ALZDAT), call. = FALSE)
}
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

290

291

292

#nfna <- strsplit(alz,"[\\|/]") %>%

292

#nfna <- strsplit(alz,"[\\|/]") %>%

293

# .[[1]] %>%

293

# .[[1]] %>%

294

# .[length(.)] %>%

294

# .[length(.)] %>%

295

# gsub("\\D","",.) %>%

295

# gsub("\\D","",.) %>%

296

# c("GSE",.,"after.txt") %>%

296

# c("GSE",.,"after.txt") %>%

297

# paste(collapse = "")

297

# paste(collapse = "")

298

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

298

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

299

300

#Perfect for excel viewing

300

#Perfect for excel viewing

301

# Output name "GSE<digits of the series matrix file name>aftexcel.txt".
# basename() strips the directories on every platform; splitting on "\"
# alone left the whole path in place on non-Windows systems, so digits from
# directory names ended up in the output name.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(alz)), "aftexcel.txt")

# Written transposed into the working directory, tab separated.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

308

309

310

311

312

313

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Added Normal to glossary of terms