Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(pryr)

2

library(pryr)

3

library(MASS)

3

library(MASS)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names

11

#1#Function for handling the changing of row names and column names

12

chngrownm <- function(mat){

12

chngrownm <- function(mat){

13

row <- dim(mat)[1]

13

row <- dim(mat)[1]

14

col <- dim(mat)[2]

14

col <- dim(mat)[2]

15

j <- 1

15

j <- 1

16

x <- 1

16

x <- 1

17

p <- 1

17

p <- 1

18

a <- 1

18

a <- 1

19

b <- 1

19

b <- 1

20

g <- 1

20

g <- 1

21

for(j in 1:col){

21

for(j in 1:col){

22

if("!Sample_source_name_ch1"==mat[1,j]){

22

if("!Sample_source_name_ch1"==mat[1,j]){

23

colnames(mat)[j] <- "Brain_Region"

23

colnames(mat)[j] <- "Brain_Region"

24

}

24

}

25

if("!Sample_title" == mat[1,j]){

25

if("!Sample_title" == mat[1,j]){

26

colnames(mat)[j] <- "Title"

26

colnames(mat)[j] <- "Title"

27

}

27

}

28

if("!Sample_geo_accession" == mat[1,j]){

28

if("!Sample_geo_accession" == mat[1,j]){

29

colnames(mat)[j] <- "ID_REF"

29

colnames(mat)[j] <- "ID_REF"

30

} else{

30

} else{

31

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

31

if(grepl("Sex|gender|Gender|sex",mat[2,j])==TRUE){

32

colnames(mat)[j] <- paste0("Sex",x)

32

colnames(mat)[j] <- paste0("Sex",x)

33

x = x + 1

33

x = x + 1

34

}

34

}

35

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

35

if(grepl("postmorteminterval|PMI|pmi",mat[2,j])==TRUE){

36

colnames(mat)[j] <- paste0("PMI",p)

36

colnames(mat)[j] <- paste0("PMI",p)

37

p = p + 1

37

p = p + 1

38

}

38

}

39

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

39

if(grepl("age|Age|AGE",mat[2,j])==TRUE){

40

colnames(mat)[j] <- paste0("Age",a)

40

colnames(mat)[j] <- paste0("Age",a)

41

a = a + 1

41

a = a + 1

42

}

42

}

43

if(grepl("braak|b&b",mat[2,j])==TRUE){

43

if(grepl("braak|b&b",mat[2,j])==TRUE){

44

colnames(mat)[j] <- paste0("Braak",b)

44

colnames(mat)[j] <- paste0("Braak",b)

45

b = b + 1

45

b = b + 1

46

}

46

}

47

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){

47

if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control",mat[2,j])==TRUE){

48

colnames(mat)[j] <- paste0("Group",g)

48

colnames(mat)[j] <- paste0("Group",g)

49

g = g + 1

49

g = g + 1

50

}

50

}

51

52

}

52

}

53

j = j + 1

53

j = j + 1

54

}

54

}

55

mat

55

mat

56

}

56

}

57

58

#2#Function for reorganizing information within the columns

58

#2#Function for reorganizing information within the columns

59

cinfo <- function(mat){

59

cinfo <- function(mat){

60

col <- dim(mat)[2]

60

col <- dim(mat)[2]

61

j <-2

61

j <-2

62

for(j in 2:col){

62

for(j in 2:col){

63

if(grepl("Group",colnames(mat)[j]) == TRUE){

63

if(grepl("Group",colnames(mat)[j]) == TRUE){

64

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

64

mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])

65

}

65

}

66

if(grepl("Age",colnames(mat)[j])==TRUE){

66

if(grepl("Age",colnames(mat)[j])==TRUE){

67

mat[,j] <- gsub("\\D","",mat[,j])%>%

67

mat[,j] <- gsub("\\D","",mat[,j])%>%

68

as.integer()

68

as.integer()

69

}

69

}

70

if(grepl("Sex",colnames(mat)[j])==TRUE){

70

if(grepl("Sex",colnames(mat)[j])==TRUE){

71

mat[,j] <- gsub(".+:\\s","",mat[,j])

71

mat[,j] <- gsub(".+:\\s","",mat[,j])

72

}

72

}

73

if(grepl("PMI",colnames(mat)[j])==TRUE){

73

if(grepl("PMI",colnames(mat)[j])==TRUE){

74

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

74

mat[,j] <- gsub("[^0-9\\.]","",mat[,j])%>%

75

as.numeric()

75

as.numeric()

76

}

76

}

77

if(grepl("Braak",colnames(mat)[j])==TRUE){

77

if(grepl("Braak",colnames(mat)[j])==TRUE){

78

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

78

mat[,j]<-gsub(".+:\\s","",mat[,j])%>%

79

as.roman()%>%

79

as.roman()%>%

80

as.integer()

80

as.integer()

81

}

81

}

82

j=j+1

82

j=j+1

83

}

83

}

84

mat

84

mat

85

}

85

}

86

87

#3#Function for labeling the gene IDs without names

87

#3#Function for labeling the gene IDs without names

88

NAFIXING <- function(GIDNAM){

88

NAFIXING <- function(GIDNAM){

89

row <- dim(GIDNAM)[1]

89

row <- dim(GIDNAM)[1]

90

i <- 1

90

i <- 1

91

for(i in 1:row){

91

for(i in 1:row){

92

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

92

if(grepl("^NA\\s*$",GIDNAM[i,2])==TRUE||is.na(GIDNAM[i,2])==TRUE){

93

GIDNAM[i,2] <- GIDNAM[i,1]

93

GIDNAM[i,2] <- GIDNAM[i,1]

94

}

94

}

95

i <- i + 1

95

i <- i + 1

96

}

96

}

97

GIDNAM

97

GIDNAM

98

}

98

}

99

100

##4#Function for changing the gene ID to gene name

100

##4#Function for changing the gene ID to gene name

101

##cgeneID <- function(GeneName,DATA){

101

##cgeneID <- function(GeneName,DATA){

102

## colGene <- dim(GeneName)[2]

102

## colGene <- dim(GeneName)[2]

103

## j <- 1

103

## j <- 1

104

## for(j in 1:colGene){

104

## for(j in 1:colGene){

105

## chngsreq <- grep(GeneName[1,j],DATA[1,])

105

## chngsreq <- grep(GeneName[1,j],DATA[1,])

106

## if(sum(chngsreq) > 0){

106

## if(sum(chngsreq) > 0){

107

## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

107

## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

108

## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

108

## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

109

## }

109

## }

110

## j = j+1

110

## j = j+1

111

## }

111

## }

112

## DATA

112

## DATA

113

##}

113

##}

114

#4#Function for changing the gene ID to gene name

114

#4#Function for changing the gene ID to gene name

115

cgeneID <- function(GeneName,DATA){

115

cgeneID <- function(GeneName,DATA){

116

colGene <- dim(GeneName)[2]

116

colGene <- dim(GeneName)[2]

117

j <- 1

117

j <- 1

118

for(j in 1:colGene){

118

for(j in 1:colGene){

119

chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

119

chngsreq <- grep(paste0("^",GeneName[1,j],"$"),DATA[1,])

120

if(is.na(sum(chngsreq))==FALSE){

120

if(is.na(sum(chngsreq))==FALSE){

121

if(sum(chngsreq) > 0){

121

if(sum(chngsreq) > 0){

122

DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

122

DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

123

}

123

}

124

}

124

}

125

#if(sum(chngsreq) > 0){

125

#if(sum(chngsreq) > 0){

126

##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

126

##DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

127

#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

127

#DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

128

#}

128

#}

129

j = j+1

129

j = j+1

130

}

130

}

131

DATA

131

DATA

132

}

132

}

133

134

#5#Function for adjusting the gene names

134

#5#Function for adjusting the gene names

135

gcnames <- function(DiData,usecol=1){

135

gcnames <- function(DiData,usecol=1){

136

nuruns <- dim(DiData)[2]

136

nuruns <- dim(DiData)[2]

137

i = 1

137

i = 1

138

nwnam <- rep("0",length.out=nuruns)

138

nwnam <- rep("0",length.out=nuruns)

139

for(i in 1:nuruns){

139

for(i in 1:nuruns){

140

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

140

if(length(strsplit(colnames(DiData)[i],"///")[[1]]) >= usecol){

141

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]

141

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][usecol]

142

} else{

142

} else{

143

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]

143

nwnam[i]=strsplit(colnames(DiData)[i],"///")[[1]][1]

144

}

144

}

145

146

}

146

}

147

nwnam

147

nwnam

148

149

}

149

}

150

151

152

153

#The Rest of this code will be used every time you want to change a data set

153

#The Rest of this code will be used every time you want to change a data set

154

155

#Getting the series matrix file

155

#Getting the series matrix file

156

print("Choose the series matrix file that you want to Analyze")

156

print("Choose the series matrix file that you want to Analyze")

157

alz <- file.choose()

157

alz <- file.choose()

158

159

#Getting the GPL file

159

#Getting the GPL file

160

print("Choose the GPL file that correlates with the above series matrix file")

160

print("Choose the GPL file that correlates with the above series matrix file")

161

genena <- file.choose()

161

genena <- file.choose()

162

163

164

#Set working directory based on the directory of the series matrix file Currently only works for windows

164

#Set working directory based on the directory of the series matrix file Currently only works for windows

165

##strsplit(alz,"[\\]") %>%

165

##strsplit(alz,"[\\]") %>%

166

## .[[1]] %>%

166

## .[[1]] %>%

167

## .[-length(.)] %>%

167

## .[-length(.)] %>%

168

## paste(.,collapse="/") %>%

168

## paste(.,collapse="/") %>%

169

## setwd()

169

## setwd()

170

171

#Find out if it is a soft GPL file or not

171

#Find out if it is a soft GPL file or not

172

soft <- strsplit(genena,"[\\|/]") %>%

172

soft <- strsplit(genena,"[\\|/]") %>%

173

.[[1]] %>%

173

.[[1]] %>%

174

.[length(.)] %>%

174

.[length(.)] %>%

175

grepl("soft",.)

175

grepl("soft",.)

176

177

#Working with the wordy part of the document

177

#Working with the wordy part of the document

178

alzword <- alz %>%

178

alzword <- alz %>%

179

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

179

read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%

180

filter(grepl("!Sample",X1))%>%

180

filter(grepl("!Sample",X1))%>%

181

filter(!grepl("!Sample_contact",X1))

181

filter(!grepl("!Sample_contact",X1))

182

183

##Changing row names and column names:

183

##Changing row names and column names:

184

ALZWORD <- t(alzword)

184

ALZWORD <- t(alzword)

185

rownames(ALZWORD)=NULL

185

rownames(ALZWORD)=NULL

186

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

186

colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)

187

ALZWORD <- chngrownm(ALZWORD)[-1,]

187

ALZWORD <- chngrownm(ALZWORD)[-1,]

188

ALZWORD <- ALZWORD%>%

188

ALZWORD <- ALZWORD%>%

189

as.data.frame()%>%

189

as.data.frame()%>%

190

dplyr::select(-starts_with("col"))

190

dplyr::select(-starts_with("col"))

191

192

##Reorganizing information within the columns

192

##Reorganizing information within the columns

193

ALZWORDF <- cinfo(ALZWORD)

193

ALZWORDF <- cinfo(ALZWORD)

194

195

196

#Working with Actual Data part of file

196

#Working with Actual Data part of file

197

alzdat <- alz %>%

197

alzdat <- alz %>%

198

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

198

read_delim(delim="\t",col_names=TRUE,comment = "!",skip=1)

199

ALZDAT <- t(alzdat[,-1])

199

ALZDAT <- t(alzdat[,-1])

200

rownames(ALZDAT)=NULL

200

rownames(ALZDAT)=NULL

201

202

##Is there a clean version of the GPL file available?

202

##Is there a clean version of the GPL file available?

203

gplnum <- strsplit(genena,"[\\|/]") %>%

203

gplnum <- strsplit(genena,"[\\|/]") %>%

204

.[[1]] %>%

204

.[[1]] %>%

205

.[length(.)] %>%

205

.[length(.)] %>%

206

gsub("\\D","",.)

206

gsub("\\D","",.)

207

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

207

clfileex <- sum(grepl(paste0("Clean_GPL",gplnum),list.files()))

208

if(clfileex >= 1){

208

if(clfileex >= 1){

209

#use the clean version

209

#use the clean version

210

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

210

geneIDNam <- paste0("Clean_GPL",gplnum,".txt") %>%

211

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

211

read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

212

213

}

213

}

214

if(clfileex == 0){

214

if(clfileex == 0){

215

##Lets Create a clean version

215

##Lets Create a clean version

216

217

##Gene ID to Gene Name

217

##Gene ID to Gene Name

218

###geneIDNam <- genena %>%

218

###geneIDNam <- genena %>%

219

### read_delim(delim="\t",comment = "#")%>%

219

### read_delim(delim="\t",comment = "#")%>%

220

### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

220

### dplyr::select(.,ID,grep("Symbol|ORF",colnames(.)))

221

###problems with the above for soft files

221

###problems with the above for soft files

222

if(soft == TRUE){

222

if(soft == TRUE){

223

#gplnum <- strsplit(genena,"[\\|/]") %>%

223

#gplnum <- strsplit(genena,"[\\|/]") %>%

224

# .[[1]] %>%

224

# .[[1]] %>%

225

# .[length(.)] %>%

225

# .[length(.)] %>%

226

# gsub("\\D","",.)

226

# gsub("\\D","",.)

227

#Check to see if there is already a file containing information on soft files

227

#Check to see if there is already a file containing information on soft files

228

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

228

fileex <- sum(grepl("GPL_ID_LOC.txt",list.files()))

229

if(fileex == 1){

229

if(fileex == 1){

230

#Check to see if this GPL soft file has been used before

230

#Check to see if this GPL soft file has been used before

231

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

231

IDF <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

232

.$GPL_FILE_NUM%>%

232

.$GPL_FILE_NUM%>%

233

grepl(gplnum,.) %>%

233

grepl(gplnum,.) %>%

234

sum()

234

sum()

235

if(IDF == 1){

235

if(IDF == 1){

236

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

236

IDLOCAL <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

237

.$GPL_FILE_NUM%>%

237

.$GPL_FILE_NUM%>%

238

grep(gplnum,.)

238

grep(gplnum,.)

239

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

239

idlocgpl <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE) %>%

240

.$LOC_ID %>%

240

.$LOC_ID %>%

241

.[IDLOCAL]

241

.[IDLOCAL]

242

geneIDNam <- genena %>%

242

geneIDNam <- genena %>%

243

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

243

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idlocgpl) %>%

244

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

244

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

245

}

245

}

246

if(IDF == 0){

246

if(IDF == 0){

247

#No information on this particular GPL file

247

#No information on this particular GPL file

248

idLOCGPL <- genena %>%

248

idLOCGPL <- genena %>%

249

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

249

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

250

t(.) %>%

250

t(.) %>%

251

.[1,] %>%

251

grep("^ID\\s*$",.) %>%

252

grep("^ID\\s*$",.) %>%

252

-1

253

-1

253

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

254

cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%

254

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

255

cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)

255

geneIDNam <- genena %>%

256

geneIDNam <- genena %>%

256

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

257

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

257

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

258

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

258

}

259

}

259

}

260

}

260

if(fileex == 0){

261

if(fileex == 0){

261

#We must create a file that we can access for later use

262

#We must create a file that we can access for later use

262

idLOCGPL <- genena %>%

263

idLOCGPL <- genena %>%

263

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

264

read_delim(delim="\t",col_names = FALSE, comment = "!", n_max = 1000) %>%

264

t(.) %>%

265

t(.) %>%

266

.[1,] %>%

265

grep("^ID\\s*$",.) %>%

267

grep("^ID\\s*$",.) %>%

266

-1

268

-1

267

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

269

Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))

268

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

270

colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")

269

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

271

write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)

270

geneIDNam <- genena %>%

272

geneIDNam <- genena %>%

271

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

273

read_delim(delim="\t",col_names = TRUE, comment = "!", skip = idLOCGPL) %>%

272

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

274

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

273

}

275

}

274

}

276

}

275

if(soft == FALSE){

277

if(soft == FALSE){

276

geneIDNam <- genena %>%

278

geneIDNam <- genena %>%

277

read_delim(delim="\t",comment = "#")%>%

279

read_delim(delim="\t",comment = "#")%>%

278

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

280

dplyr::select(.,ID,grep("Symbol|^ORF\\s*$",colnames(.)))

279

}

281

}

280

282

281

##Labeling the gene IDs without names

283

##Labeling the gene IDs without names

282

geneIDNam <- NAFIXING(geneIDNam)

284

geneIDNam <- NAFIXING(geneIDNam)

283

285

284

##remove the whitespace

286

##remove the whitespace

285

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

287

geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

286

288

287

##Here is the clean version

289

##Here is the clean version

288

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

290

write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)

289

}

291

}

290

292

291

293

292

294

293

##Changing the gene ID to gene name

295

##Changing the gene ID to gene name

294

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

296

ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))

295

colnames(ALZDAT) = ALZDAT1[1,]

297

colnames(ALZDAT) = ALZDAT1[1,]

296

298

297

299

298

##Adjusting the column names aka the gene names

300

##Adjusting the column names aka the gene names

299

colnames(ALZDAT) <- gcnames(ALZDAT)

301

colnames(ALZDAT) <- gcnames(ALZDAT)

300

302

301

303

302

#Full Data

304

#Full Data

303

Fullalzdw <- ALZDAT %>%

305

Fullalzdw <- ALZDAT %>%

304

as.data.frame() %>%

306

as.data.frame() %>%

305

cbind(ALZWORDF,.)

307

cbind(ALZWORDF,.)

306

308

307

309

308

#nfna <- strsplit(alz,"[\\|/]") %>%

310

#nfna <- strsplit(alz,"[\\|/]") %>%

309

# .[[1]] %>%

311

# .[[1]] %>%

310

# .[length(.)] %>%

312

# .[length(.)] %>%

311

# gsub("\\D","",.) %>%

313

# gsub("\\D","",.) %>%

312

# c("GSE",.,"after.txt") %>%

314

# c("GSE",.,"after.txt") %>%

313

# paste(collapse = "")

315

# paste(collapse = "")

314

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

316

#write.matrix(Fullalzdw,file = nfna,sep = "\t")

315

317

316

#Perfect for excel viewing

318

#Perfect for excel viewing

317

nfnaex <- strsplit(alz,"[\\]") %>%

319

nfnaex <- strsplit(alz,"[\\]") %>%

318

.[[1]] %>%

320

.[[1]] %>%

319

.[length(.)] %>%

321

.[length(.)] %>%

320

gsub("\\D","",.) %>%

322

gsub("\\D","",.) %>%

321

c("GSE",.,"aftexcel.txt") %>%

323

c("GSE",.,"aftexcel.txt") %>%

322

paste(collapse = "")

324

paste(collapse = "")

323

write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

325

write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

324

326

325

327

326

328

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

fixed issue with idLOCGPL

 #Libraries required to run the code
 library(pryr)
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1# Rename the columns of the transposed sample-metadata matrix.
 #
 # mat: character matrix whose column names are already set. Row 1 holds the
 #      series-matrix field name of each column (e.g. "!Sample_title") and
 #      row 2 holds the first sample's value for that field.
 # Returns `mat` with relabelled columns; cell contents are never modified.
 #
 # Labelling rules:
 #   * Row-1 field names map to fixed labels (Brain_Region, Title, ID_REF).
 #   * Every column except the accession column is then tested against the
 #     keyword patterns below using its row-2 value. When several patterns
 #     match, the LAST one in the list wins, so a keyword hit can still
 #     replace a Title/Brain_Region label.
 #   * Keyword labels are numbered per label (Sex1, Sex2, ...). Only the label
 #     actually assigned consumes a number, so numbering stays consecutive
 #     even when a value matches several patterns (e.g. "braak stage: IV"
 #     contains "age" but ends up as Braak1 without using up Age1).
 chngrownm <- function(mat) {
   header_labels <- c(
     "!Sample_source_name_ch1" = "Brain_Region",
     "!Sample_title" = "Title",
     "!Sample_geo_accession" = "ID_REF"
   )
   # Order matters: later entries take precedence over earlier ones.
   keyword_rules <- c(
     Sex = "Sex|gender|Gender|sex",
     PMI = "postmorteminterval|PMI|pmi",
     Age = "age|Age|AGE",
     Braak = "braak|b&b",
     Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
   )
   used <- setNames(integer(length(keyword_rules)), names(keyword_rules))

   for (j in seq_len(ncol(mat))) {
     header <- as.character(mat[1, j])
     if (!is.na(header) && header %in% names(header_labels)) {
       colnames(mat)[j] <- header_labels[[header]]
     }
     # The accession column is never subject to keyword matching.
     if (identical(header, "!Sample_geo_accession")) {
       next
     }
     first_value <- mat[2, j]
     hits <- vapply(
       keyword_rules,
       function(pattern) grepl(pattern, first_value),
       logical(1)
     )
     if (any(hits)) {
       label <- names(keyword_rules)[max(which(hits))]
       used[[label]] <- used[[label]] + 1L
       colnames(mat)[j] <- paste0(label, used[[label]])
     }
   }
   mat
 }
 #2# Clean the values inside the labelled sample-metadata columns.
 #
 # mat: data frame (or matrix) whose columns carry the Group*/Age*/Sex*/PMI*/
 #      Braak* labels; values look like "field name: value".
 # Returns `mat` with those columns rewritten:
 #   Group -> text after the "label: " prefix (and without a "; ..." tail)
 #   Age   -> integer, first run of digits after the label
 #   Sex   -> text after the "label: " prefix
 #   PMI   -> numeric, first decimal number after the label
 #   Braak -> integer value of the Roman numeral after the label
 # The first column is never touched; other columns are left as they are.
 cinfo <- function(mat) {
   # Drop everything up to and including the last ": " in each value.
   strip_label <- function(x) gsub(".+:\\s", "", x)

   # First match of `pattern` in each element, NA where there is none.
   # Taking one token (instead of deleting every non-digit) keeps
   # "85.5" from turning into 855 and "(hrs.): 12.5" from turning into ".12.5".
   first_number <- function(x, pattern) {
     x <- as.character(x)
     found <- regexpr(pattern, x)
     has_number <- !is.na(found) & found > 0
     out <- rep(NA_character_, length(x))
     out[has_number] <- regmatches(x, found)
     out
   }

   # seq_len()[-1] is empty for a one-column input, unlike 2:col.
   for (j in seq_len(ncol(mat))[-1]) {
     field <- colnames(mat)[j]
     if (grepl("Group", field)) {
       mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
     }
     if (grepl("Age", field)) {
       mat[, j] <- as.integer(first_number(strip_label(mat[, j]), "[0-9]+"))
     }
     if (grepl("Sex", field)) {
       mat[, j] <- strip_label(mat[, j])
     }
     if (grepl("PMI", field)) {
       mat[, j] <- as.numeric(
         first_number(strip_label(mat[, j]), "[0-9]+\\.?[0-9]*|\\.[0-9]+")
       )
     }
     if (grepl("Braak", field)) {
       mat[, j] <- as.integer(as.roman(strip_label(mat[, j])))
     }
   }
   mat
 }
 #3# Give every probe without a gene name its own ID as the name.
 #
 # GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
 #         probe ID and column 2 the gene symbol.
 # Returns `GIDNAM` where each column-2 entry that is NA, or the literal text
 # "NA" (optionally followed by whitespace), is replaced by the column-1 value
 # of the same row. A table with zero rows is returned unchanged.
 NAFIXING <- function(GIDNAM) {
   is_df <- is.data.frame(GIDNAM)
   ids <- if (is_df) GIDNAM[[1]] else GIDNAM[, 1]
   symbols <- if (is_df) GIDNAM[[2]] else GIDNAM[, 2]

   unnamed <- is.na(symbols) | grepl("^NA\\s*$", symbols)
   if (any(unnamed)) {
     # Factors cannot take values outside their levels, so work on text.
     if (is.factor(symbols)) symbols <- as.character(symbols)
     if (is.factor(ids)) ids <- as.character(ids)
     symbols[unnamed] <- ids[unnamed]
     if (is_df) {
       GIDNAM[[2]] <- symbols
     } else {
       GIDNAM[, 2] <- symbols
     }
   }
   GIDNAM
 }
 ##4#Function for changing the gene ID to gene name
 ##cgeneID <- function(GeneName,DATA){
 ##	colGene <- dim(GeneName)[2]
 ##	j <- 1
 ##	for(j in 1:colGene){
 ##		chngsreq <- grep(GeneName[1,j],DATA[1,])
 ##		if(sum(chngsreq) > 0){
 ##			#DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])
 ##			DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])
 ##		}
 ##		j = j+1
 ##	}
 ##	DATA
 ##}
 #4# Replace probe IDs in the first row of DATA with their gene names.
 #
 # GeneName: 2 x n matrix; row 1 = probe IDs, row 2 = gene names.
 # DATA:     matrix whose first row holds probe IDs (one per column).
 # Returns DATA with every first-row entry that exactly equals an ID in
 # GeneName replaced by the corresponding name; all other cells are unchanged.
 #
 # IDs are compared as literal strings with match(), so characters such as
 # "." or "+" inside an ID are not interpreted as regular-expression syntax,
 # and the whole lookup is a single pass instead of one grep() per gene.
 # When an ID occurs more than once in GeneName the first occurrence is used.
 cgeneID <- function(GeneName, DATA) {
   data_ids <- as.character(unlist(DATA[1, ], use.names = FALSE))
   gene_ids <- as.character(GeneName[1, ])

   position <- match(data_ids, gene_ids)
   # Missing IDs in DATA must stay missing rather than match an NA gene ID.
   mapped <- !is.na(data_ids) & !is.na(position)
   DATA[1, mapped] <- GeneName[2, position[mapped]]
   DATA
 }
 #5# Reduce multi-gene column names ("A /// B /// C") to a single gene name.
 #
 # DiData: matrix or data frame whose column names are gene names, with
 #         alternatives separated by "///".
 # usecol: which alternative to keep (default 1, the first); columns with
 #         fewer alternatives than `usecol` fall back to their first one.
 # Returns a character vector with one name per column, with the whitespace
 # that surrounds the "///" separator removed. An input without columns
 # yields character(0).
 gcnames <- function(DiData, usecol = 1) {
   original_names <- colnames(DiData)
   if (is.null(original_names)) {
     if (ncol(DiData) > 0) {
       stop("DiData must have column names", call. = FALSE)
     }
     return(character(0))
   }

   alternatives <- strsplit(original_names, "///", fixed = TRUE)
   chosen <- vapply(
     alternatives,
     function(parts) {
       if (length(parts) >= usecol) parts[usecol] else parts[1]
     },
     character(1)
   )
   trimws(chosen)
 }
 # Interactive part of the script: everything from here on is run once for
 # each data set that is to be converted.
 # Prompt for the series matrix file; file.choose() opens a file picker and
 # the selected path is stored in `alz`.
 print("Choose the series matrix file that you want to Analyze")
 alz <- file.choose()
 # Prompt for the matching GPL (platform annotation) file; the selected path
 # is stored in `genena`.
 print("Choose the GPL file that correlates with the above series matrix file")
 genena <- file.choose()
 #Set working directory based on the directory of the series matrix file Currently only works for windows
 ##strsplit(alz,"[\\]") %>%
 ##	.[[1]] %>%
 ##	.[-length(.)] %>%
 ##	paste(.,collapse="/") %>%
 ##	setwd()
 # TRUE when the chosen GPL file's name (last path component, split on "\",
 # "|" or "/") contains the text "soft".
 soft <- local({
   path_pieces <- strsplit(genena, "[\\|/]")[[1]]
   file_name <- path_pieces[length(path_pieces)]
   grepl("soft", file_name)
 })
 # Sample metadata (the "wordy" part of the series matrix file).
 # Keep only the "!Sample..." rows, leaving out the "!Sample_contact..." ones.
 alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE) %>%
   dplyr::filter(grepl("!Sample", X1), !grepl("!Sample_contact", X1))
 ## Changing row names and column names:
 # After transposing, each column is one metadata field and row 1 holds the
 # field name. Columns first get placeholder names ("col1", "col2", ...);
 # chngrownm() relabels the ones it recognises and the rest are dropped.
 ALZWORD <- t(alzword)
 rownames(ALZWORD) <- NULL
 colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
 # drop = FALSE keeps a matrix even when the series has a single sample.
 ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE] %>%
   as.data.frame() %>%
   dplyr::select(-starts_with("col"))
 ## Reorganizing information within the columns
 ALZWORDF <- cinfo(ALZWORD)
 # Expression values (the data part of the series matrix file): lines that
 # start with "!" are treated as comments and the first line is skipped.
 alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
 # Drop the first (ID) column and transpose so that rows are samples and
 # columns are probes.
 ALZDAT <- t(alzdat[, -1])
 rownames(ALZDAT) <- NULL
 ## Build (or reuse) the probe-ID -> gene-symbol table `geneIDNam`.
 ##
 ## Files used in the working directory:
 ##   Clean_GPL<number>.txt  two-column ID/Symbol table written by a previous
 ##                          run for this platform; reused when present.
 ##   GPL_ID_LOC.txt         for SOFT files, remembers per GPL number how many
 ##                          lines precede the "ID" header line.
 ## Both files are looked up by exact name / exact GPL number, so e.g. GPL96
 ## is never confused with GPL9600.

 # GPL number = the digits in the chosen file's name (last path component).
 gplnum <- local({
   path_pieces <- strsplit(genena, "[\\|/]")[[1]]
   gsub("\\D", "", path_pieces[length(path_pieces)])
 })
 clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
 clfileex <- as.integer(file.exists(clean_gpl_file))

 if (clfileex >= 1) {
   # Use the clean version
   geneIDNam <- read_delim(clean_gpl_file, delim = "\t",
                           col_names = c("ID", "Symbol"), comment = "!")
 } else {
   ## Lets create a clean version
   if (soft) {
     loc_file <- "GPL_ID_LOC.txt"
     fileex <- as.integer(file.exists(loc_file))

     # Has this GPL soft file been used before?
     idLOCGPL <- numeric(0)
     if (fileex == 1) {
       id_loc <- read_delim(loc_file, delim = "\t", col_names = TRUE)
       idLOCGPL <- id_loc$LOC_ID[which(id_loc$GPL_FILE_NUM == as.integer(gplnum))]
     }

     if (length(idLOCGPL) == 0) {
       # No information on this particular GPL file yet: find the header line
       # whose first field is "ID" within the first 1000 rows.
       soft_head <- read_delim(genena, delim = "\t", col_names = FALSE,
                               comment = "!", n_max = 1000)
       idLOCGPL <- grep("^ID\\s*$", soft_head[[1]]) - 1
       if (length(idLOCGPL) == 0) {
         stop("Could not find the 'ID' header line in ", genena, call. = FALSE)
       }
       idLOCGPL <- idLOCGPL[1]

       # Remember the location for later runs.
       new_entry <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
       if (fileex == 1) {
         cat(new_entry, file = loc_file, sep = "\t", fill = TRUE, append = TRUE)
       } else {
         colnames(new_entry) <- c("GPL_FILE_NUM", "LOC_ID")
         write.table(new_entry, file = loc_file, sep = "\t",
                     row.names = FALSE, col.names = TRUE)
       }
     } else {
       idLOCGPL <- idLOCGPL[1]
     }

     geneIDNam <- read_delim(genena, delim = "\t", col_names = TRUE,
                             comment = "!", skip = idLOCGPL) %>%
       dplyr::select(., ID, grep("Symbol|^ORF\\s*$", colnames(.)))
   } else {
     geneIDNam <- read_delim(genena, delim = "\t", comment = "#") %>%
       dplyr::select(., ID, grep("Symbol|^ORF\\s*$", colnames(.)))
   }

   ## Labeling the gene IDs without names
   geneIDNam <- NAFIXING(geneIDNam)
   ## Remove the whitespace (result is a two-column character matrix)
   geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1, ]), str_trim(t(geneIDNam)[2, ])))
   ## Here is the clean version
   write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
               row.names = FALSE, col.names = FALSE)
 }
 ## Changing the gene ID to gene name: cgeneID() rewrites the first row of the
 ## transposed table, and that row becomes the column names of ALZDAT.
 ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
 colnames(ALZDAT) <- ALZDAT1[1, ]
 ## Adjusting the column names aka the gene names
 colnames(ALZDAT) <- gcnames(ALZDAT)
 # Full data: cleaned sample metadata first, expression columns after it.
 Fullalzdw <- cbind(ALZWORDF, as.data.frame(ALZDAT))
 #nfna <- strsplit(alz,"[\\|/]") %>%
 #	.[[1]] %>%
 #	.[length(.)] %>%
 #	gsub("\\D","",.) %>%
 #	c("GSE",.,"after.txt") %>%
 #	paste(collapse = "")
 #write.matrix(Fullalzdw,file = nfna,sep = "\t")
 # Perfect for Excel viewing: write the transposed table to
 # "GSE<digits>aftexcel.txt" in the working directory, where <digits> are the
 # digits of the series matrix file's name. The path is split on "\", "|" and
 # "/" so that only the file name, not its directories, contributes digits.
 nfnaex <- local({
   path_pieces <- strsplit(alz, "[\\|/]")[[1]]
   series_digits <- gsub("\\D", "", path_pieces[length(path_pieces)])
   paste0("GSE", series_digits, "aftexcel.txt")
 })
 write.table(t(Fullalzdw), file = nfnaex, sep = "\t")