# Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(pryr)

2

library(pryr)

3

library(MASS)

3

library(MASS)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names
# Renames the columns of a character matrix whose first row holds GEO field
# names (e.g. "!Sample_title") and whose second row holds the first sample's
# values.
#
# mat: matrix with existing column names, field names in row 1 and at least
#      one sample in row 2.
# Returns `mat` with updated column names; cell contents are untouched.
#
# Rules, applied per column in this order (a later match overwrites an
# earlier one, and a counter advances every time its pattern matches, even
# when the name is overwritten afterwards):
#   row 1 == "!Sample_source_name_ch1" -> "Brain_Region"
#   row 1 == "!Sample_title"           -> "Title"
#   row 1 == "!Sample_geo_accession"   -> "ID_REF" (no further checks)
#   otherwise row 2 is tested against the Sex, PMI, Age, Braak and Group
#   patterns, giving numbered names such as "Sex1", "Age2", ...
chngrownm <- function(mat) {
  # Patterns are checked in this order; the order decides which name wins
  # when a value matches several of them (e.g. "braak stage" contains "age").
  patterns <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)

  # seq_len() keeps a zero-column matrix from being indexed at column 1.
  for (j in seq_len(dim(mat)[2])) {
    header <- mat[1, j]

    # isTRUE() treats a missing header cell as "no match" instead of
    # raising "missing value where TRUE/FALSE needed".
    if (isTRUE(header == "!Sample_source_name_ch1")) {
      colnames(mat)[j] <- "Brain_Region"
    }
    if (isTRUE(header == "!Sample_title")) {
      colnames(mat)[j] <- "Title"
    }
    if (isTRUE(header == "!Sample_geo_accession")) {
      colnames(mat)[j] <- "ID_REF"
    } else {
      value <- mat[2, j]
      for (label in names(patterns)) {
        if (grepl(patterns[[label]], value)) {
          colnames(mat)[j] <- paste0(label, counters[[label]])
          counters[[label]] <- counters[[label]] + 1
        }
      }
    }
  }
  mat
}

57

58

#2#Function for reorganizing information within the columns
# Cleans the sample-annotation columns, choosing the treatment from the
# column name. The first column is never modified.
#
# mat: data frame whose column names contain "Group", "Age", "Sex", "PMI" or
#      "Braak" for the columns to clean.
# Returns `mat` with the matching columns rewritten:
#   Group -> text after the "label: " prefix (and any "; ..." tail removed)
#   Age   -> first run of digits as an integer (NA when there are none)
#   Sex   -> text after the "label: " prefix
#   PMI   -> digits and dots kept, converted to numeric
#   Braak -> text after the prefix read as a roman numeral, as an integer
cinfo <- function(mat) {
  # Take the first whole number in each string. Stripping every non-digit
  # would glue "80.5" into 805.
  first_integer <- function(x) {
    x <- as.character(x)
    hit <- regexpr("[0-9]+", x)
    found <- !is.na(hit) & hit > 0
    out <- rep(NA_integer_, length(x))
    out[found] <- as.integer(regmatches(x, hit))
    out
  }

  # seq_len(...)[-1] is empty for a one-column input, where 2:col would
  # count backwards through columns 2 and 1.
  for (j in seq_len(dim(mat)[2])[-1]) {
    field <- colnames(mat)[j]

    if (grepl("Group", field)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", field)) {
      mat[, j] <- first_integer(mat[, j])
    }
    if (grepl("Sex", field)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", field)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", field)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

86

87

#3#Function for labeling the gene IDs without names
# Gives every unnamed entry in the second column a unique placeholder name
# "NA1", "NA2", ... in row order.
#
# GIDNAM: data frame / tibble (or matrix) whose second column holds gene
#         names.
# Returns `GIDNAM` with the second column relabelled where it was missing.
#
# An entry counts as unnamed when it is a real missing value or the text
# "NA" optionally followed by whitespace.
NAFIXING <- function(GIDNAM) {
  is_df <- is.data.frame(GIDNAM)
  symbols <- as.character(if (is_df) GIDNAM[[2]] else GIDNAM[, 2])

  # Test the whole column at once. is.na() catches real missing values
  # directly, and a zero-row input needs no special handling.
  unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)

  if (any(unnamed)) {
    symbols[unnamed] <- paste0("NA", seq_len(sum(unnamed)))
    if (is_df) {
      GIDNAM[[2]] <- symbols
    } else {
      GIDNAM[, 2] <- symbols
    }
  }
  GIDNAM
}

101

102

#4#Function for changing the gene ID to gene name
# Replaces the probe IDs in the first row of DATA with their gene names.
#
# GeneName: matrix with probe IDs in row 1 and the matching gene names in
#           row 2 (one column per probe).
# DATA:     matrix whose first row holds probe IDs.
# Returns DATA with row 1 rewritten; IDs without an entry in GeneName are
# left as they are.
#
# IDs are looked up by exact equality after trimming whitespace. Using the
# ID as a regular expression and replacing the matched substring lets a short
# ID rewrite longer ones ("^1" also hits "10" and leaves "<name>0"), treats
# "." and other metacharacters as wildcards, and makes an empty or NA ID
# match almost every cell.
cgeneID <- function(GeneName, DATA) {
  known_ids <- trimws(as.character(GeneName[1, ]))
  current <- trimws(as.character(DATA[1, ]))

  # Missing and empty IDs are never treated as a match. When an ID appears
  # more than once in GeneName, match() uses its first occurrence.
  idx <- match(current, known_ids, incomparables = c(NA_character_, ""))
  hit <- which(!is.na(idx))

  DATA[1, hit] <- GeneName[2, idx[hit]]
  DATA
}

113

114

#5#Function for adjusting the gene names
# Reduces each column name that lists several gene names separated by "///"
# to a single name.
#
# DiData: matrix or data frame with column names.
# usecol: which of the "///"-separated names to keep (default: the first).
#         Names with fewer than `usecol` parts fall back to their first part.
# Returns a character vector with one adjusted name per column. An empty
# column name yields NA.
#
# The chosen part is trimmed, because names written as "A /// B" would
# otherwise come out as "A " or " B".
gcnames <- function(DiData, usecol = 1) {
  n_cols <- dim(DiData)[2]
  col_labels <- colnames(DiData)
  if (isTRUE(n_cols > 0) && is.null(col_labels)) {
    stop("DiData has no column names to adjust", call. = FALSE)
  }

  # "///" is a literal separator, so no regular expression is needed.
  pieces <- strsplit(as.character(col_labels), "///", fixed = TRUE)

  vapply(
    pieces,
    function(parts) {
      chosen <- if (length(parts) >= usecol) parts[usecol] else parts[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

130

131

132

133

#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
# The prompt is printed first, then a file chooser returns the path.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file Currently only works for windows
##strsplit(alz,"[\\]") %>%
## .[[1]] %>%
## .[-length(.)] %>%
## paste(.,collapse="/") %>%
## setwd()

#Find out if it is a soft GPL file or not
# Split the chosen path on "\", "|" or "/" and look for "soft" in the last
# component only.
gpl_path_parts <- strsplit(genena, "[\\|/]")[[1]]
soft <- grepl("soft", gpl_path_parts[length(gpl_path_parts)])

156

157

#Working with the wordy part of the document
# Read the series matrix file as tab-delimited text, treating everything from
# "!Series" onwards on a line as a comment, then keep the "!Sample" rows
# except the "!Sample_contact" ones.
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  filter(grepl("!Sample", X1)) %>%
  filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
# After transposing, row 1 holds the field names and each later row is one
# sample. Columns get placeholder names ("col1", "col2", ...) so chngrownm()
# can overwrite the ones it recognises.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# drop = FALSE keeps a one-sample file as a 1-row matrix; without it the
# result collapses to a vector and the data frame below gets the wrong shape.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns still carrying a placeholder name were not recognised; drop them.
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

174

175

176

#Working with Actual Data part of file
# Read the expression table from the same series matrix file: skip the first
# line, treat "!" as the comment marker and take the header from the file.
alzdat <- read_delim(
  alz,
  delim = "\t",
  col_names = TRUE,
  comment = "!",
  skip = 1
)

# Transpose everything except the first column and clear the row names.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

181

182

183

##Gene ID to Gene Name
###geneIDNam <- genena %>%
### read_delim(delim="\t",comment = "#")%>%
### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))
###problems with the above for soft files

# File in the working directory that remembers, per GPL number, how many
# leading lines of the soft file have to be skipped (columns GPL_FILE_NUM
# and LOC_ID).
registry_file <- "GPL_ID_LOC.txt"

# Count the lines to skip at the top of a soft file: read the first 1000
# rows without a header ("!" lines are comments), count the cells that start
# with a non-digit and subtract one.
count_soft_header_lines <- function(path) {
  head_rows <- read_delim(path, delim = "\t", col_names = FALSE,
                          comment = "!", n_max = 1000)
  length(grep("^\\D", t(head_rows))) - 1
}

# Keep the ID column plus every column whose name contains "Symbol" or "ORF".
pick_gene_columns <- function(tbl) {
  dplyr::select(tbl, ID, grep("Symbol|ORF", colnames(tbl)))
}

if (soft) {
  # The GPL number is every digit in the chosen file name.
  gpl_file_parts <- strsplit(genena, "[\\|/]")[[1]]
  gplnum <- gsub("\\D", "", gpl_file_parts[length(gpl_file_parts)])
  gpl_id <- as.integer(gplnum)

  #Check to see if there is already a file containing information on soft files
  registry_exists <- file.exists(registry_file)
  skip_lines <- NULL
  if (registry_exists) {
    #Check to see if this GPL soft file has been used before
    # The registry is read once and compared by exact number, so GPL 96 is
    # not confused with GPL 1096 and a repeated entry still resolves.
    registry <- read_delim(registry_file, delim = "\t", col_names = TRUE)
    known <- which(registry$GPL_FILE_NUM == gpl_id)
    if (length(known) > 0) {
      skip_lines <- registry$LOC_ID[known[1]]
    }
  }

  if (is.null(skip_lines)) {
    #No information on this particular GPL file
    skip_lines <- count_soft_header_lines(genena)
    entry <- cbind(gpl_id, as.integer(skip_lines))
    if (registry_exists) {
      # Append "<GPL number><TAB><skip count>" as a new line.
      cat(entry, file = registry_file, sep = "\t", fill = TRUE, append = TRUE)
    } else {
      #We must create a file that we can access for later use
      colnames(entry) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(entry, file = registry_file, sep = "\t",
                  row.names = FALSE, col.names = TRUE)
    }
  }

  geneIDNam <- genena %>%
    read_delim(delim = "\t", col_names = TRUE, comment = "!",
               skip = skip_lines) %>%
    pick_gene_columns()
} else {
  # Non-soft GPL tables: "#" lines are comments and the header follows them.
  geneIDNam <- genena %>%
    read_delim(delim = "\t", comment = "#") %>%
    pick_gene_columns()
}

246

247

##Labeling the gene IDs without names
geneIDNam <- NAFIXING(geneIDNam)

##remove the whitespace
# Transpose once, trim the first two rows (IDs and names) and bind them back
# together as the two columns of a matrix.
gene_rows <- t(geneIDNam)
geneIDNam <- cbind(str_trim(gene_rows[1, ]), str_trim(gene_rows[2, ]))

##Changing the gene ID to gene name
# cgeneID() receives IDs/names as rows and the transposed expression table;
# the first row of its result becomes the column names of ALZDAT.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

260

261

262

#Full Data
# Sample annotations first, followed by the expression values.
Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))


# Output names are built from the digits of the series matrix FILE NAME.
# The path is split on "\", "|" and "/" (the same separators used for the
# GPL path), so on "/"-separated paths the digits of parent directories no
# longer leak into the output name.
alz_path_parts <- strsplit(alz, "[\\|/]")[[1]]
gse_digits <- gsub("\\D", "", alz_path_parts[length(alz_path_parts)])

nfna <- paste(c("GSE", gse_digits, "after.txt"), collapse = "")
write.matrix(Fullalzdw, file = nfna, sep = "\t")

#Perfect for excel viewing
nfnaex <- paste(c("GSE", gse_digits, "aftexcel.txt"), collapse = "")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

283

284

285

# GITLAB
#
# Efrain Gonzalez / Cleaning and Fixing Data with R
#
# Found problem with the function cgeneID and fixed it.