Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(pryr)

2

library(pryr)

3

library(MASS)

3

library(MASS)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names

11

#1#Function for handling the changing of row names and column names

12

# Rename the columns of a transposed GEO sample-annotation table.
#
# mat: matrix (or data frame) whose FIRST row holds the series-matrix field
#      names (e.g. "!Sample_title") and whose SECOND row holds the first
#      sample's value for each field. Missing column names are filled in as
#      "col1", "col2", ... before renaming.
#
# Returns `mat` with these column names replaced:
#   "!Sample_source_name_ch1" -> "Brain_Region"
#   "!Sample_title"           -> "Title"
#   "!Sample_geo_accession"   -> "ID_REF"
# Every other column is named from keywords found in its second-row value:
# "Sex<n>", "PMI<n>", "Age<n>", "Braak<n>" or "Group<n>", where <n> is a
# per-label running counter. Columns matching nothing keep their name.
chngrownm <- function(mat) {
  n_row <- dim(mat)[1]
  n_col <- dim(mat)[2]
  if (n_row < 1) {
    return(mat)
  }
  if (is.null(colnames(mat)) && n_col > 0) {
    colnames(mat) <- colnames(mat, do.NULL = FALSE)
  }

  fixed_names <- c(
    "!Sample_source_name_ch1" = "Brain_Region",
    "!Sample_title" = "Title",
    "!Sample_geo_accession" = "ID_REF"
  )
  # Order matters: when a value matches several rules the LAST matching rule
  # names the column, and every matching rule still advances its counter.
  keyword_rules <- c(
    Sex = "Sex|gender|Gender|sex",
    PMI = "postmorteminterval|PMI|pmi",
    Age = "age|Age|AGE",
    Braak = "braak|b&b",
    Group = "group|disease|control|AD|normal|diagnosis|Alzheimer|Control"
  )
  counters <- c(Sex = 1, PMI = 1, Age = 1, Braak = 1, Group = 1)
  can_scan_values <- n_row >= 2

  for (j in seq_len(n_col)) {
    header <- as.character(mat[1, j])

    # A recognised field name is final: it must not be overwritten by the
    # keyword scan (e.g. a sample title that happens to contain "AD").
    if (!is.na(header) && header %in% names(fixed_names)) {
      colnames(mat)[j] <- fixed_names[[header]]
      next
    }
    if (!can_scan_values) {
      next
    }

    for (label in names(keyword_rules)) {
      if (grepl(keyword_rules[[label]], mat[2, j])) {
        colnames(mat)[j] <- paste0(label, counters[[label]])
        counters[[label]] <- counters[[label]] + 1
      }
    }
  }
  mat
}

57

58

#2#Function for reorganizing information within the columns

58

#2#Function for reorganizing information within the columns

59

# Clean the values of the annotation columns named by chngrownm().
#
# mat: data frame whose column names contain "Group", "Age", "Sex", "PMI" or
#      "Braak". The first column is never touched (the scan starts at
#      column 2), and columns with other names are left unchanged.
#
# Per column, based on its name:
#   Group -> text up to the last ": " is removed, as is a trailing
#            " ...;..." remainder
#   Age   -> every non-digit is removed, result converted to integer
#   Sex   -> text up to the last ": " is removed
#   PMI   -> everything except digits and "." is removed, converted to numeric
#   Braak -> text up to the last ": " is removed, the rest is read as a Roman
#            numeral and converted to integer
#
# Returns the modified `mat`.
cinfo <- function(mat) {
  n_col <- dim(mat)[2]

  # seq_len(n)[-1] is empty for n < 2; `2:n` would count backwards instead.
  for (j in seq_len(n_col)[-1]) {
    col_name <- colnames(mat)[j]
    if (is.null(col_name) || is.na(col_name)) {
      next
    }

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      # NOTE(review): dropping every non-digit also drops a decimal point,
      # so a value such as "85.5" would become 855 - confirm ages are whole.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }
  mat
}

86

87

#3#Function for labeling the gene IDs without names

87

#3#Function for labeling the gene IDs without names

88

# Give unnamed probes a usable label.
#
# GIDNAM: two-column table (data frame, tibble or matrix); column 1 holds the
#         probe/gene ID and column 2 the gene name.
#
# Wherever column 2 is NA, or is the literal text "NA" optionally followed by
# whitespace, it is replaced by the ID from column 1 of the same row.
# Returns the modified table; a table with zero rows is returned unchanged.
NAFIXING <- function(GIDNAM) {
  if (dim(GIDNAM)[1] == 0) {
    return(GIDNAM)
  }

  ids <- GIDNAM[, 1, drop = TRUE]
  gene_names <- GIDNAM[, 2, drop = TRUE]
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$", gene_names)

  if (any(unnamed)) {
    fill <- ids[unnamed]
    # Keep a character name column character even when the IDs are numeric.
    if (is.character(gene_names)) {
      fill <- as.character(fill)
    }
    GIDNAM[unnamed, 2] <- fill
  }
  GIDNAM
}

99

100

##4#Function for changing the gene ID to gene name

100

##4#Function for changing the gene ID to gene name

101

##cgeneID <- function(GeneName,DATA){

101

##cgeneID <- function(GeneName,DATA){

102

## colGene <- dim(GeneName)[2]

102

## colGene <- dim(GeneName)[2]

103

## j <- 1

103

## j <- 1

104

## for(j in 1:colGene){

104

## for(j in 1:colGene){

105

## chngsreq <- grep(GeneName[1,j],DATA[1,])

105

## chngsreq <- grep(GeneName[1,j],DATA[1,])

106

## if(sum(chngsreq) > 0){

106

## if(sum(chngsreq) > 0){

107

## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

107

## #DATA[1,chngsreq] <- gsub(GeneName[1,j],GeneName[2,j],DATA[1,chngsreq])

108

## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

108

## DATA[1,chngsreq] <- gsub(paste0("^",GeneName[1,j]),GeneName[2,j],DATA[1,chngsreq])

109

## }

109

## }

110

## j = j+1

110

## j = j+1

111

## }

111

## }

112

## DATA

112

## DATA

113

##}

113

##}

114

#4#Function for changing the gene ID to gene name

114

#4#Function for changing the gene ID to gene name

115

# Replace gene IDs in the first row of DATA with their gene names.
#
# GeneName: two-row table; row 1 holds the IDs, row 2 the matching names.
# DATA:     table whose first row holds the IDs to translate.
#
# Each entry of DATA[1, ] that is exactly equal to an ID in GeneName[1, ] is
# replaced by the corresponding name (first occurrence wins when an ID is
# listed more than once). Entries with no matching ID are left untouched.
# Returns the modified DATA.
#
# IDs are compared as literal strings rather than pasted into a regular
# expression, so IDs containing characters such as "." or "(" are matched
# exactly, and the lookup is a single match() instead of one scan per ID.
cgeneID <- function(GeneName, DATA) {
  ids <- as.character(unlist(GeneName[1, ], use.names = FALSE))
  gene_names <- unlist(GeneName[2, ], use.names = FALSE)
  current <- as.character(unlist(DATA[1, ], use.names = FALSE))

  # incomparables = NA: a missing entry in DATA never counts as a match.
  position <- match(current, ids, incomparables = NA)
  found <- !is.na(position)

  if (any(found)) {
    DATA[1, found] <- gene_names[position[found]]
  }
  DATA
}

133

134

#5#Function for adjusting the gene names

134

#5#Function for adjusting the gene names

135

# Pick one gene name per column when a column name lists several.
#
# DiData: table whose column names may hold alternatives separated by "///"
#         (e.g. "DDR1 /// MIR4640").
# usecol: which alternative to keep (default 1). Columns with fewer than
#         `usecol` alternatives fall back to their first one.
#
# Returns a character vector with one name per column of DiData. Whitespace
# around the chosen alternative is removed, so " /// "-separated names do not
# come back with a stray leading or trailing space.
gcnames <- function(DiData, usecol = 1) {
  n_col <- dim(DiData)[2]
  if (n_col == 0) {
    return(character(0))
  }

  col_names <- colnames(DiData)
  if (is.null(col_names)) {
    stop("DiData must have column names", call. = FALSE)
  }

  # Split every name once instead of re-splitting it for each test.
  alternatives <- strsplit(as.character(col_names), "///")
  vapply(
    alternatives,
    function(options) {
      chosen <- if (length(options) >= usecol) options[usecol] else options[1]
      trimws(chosen)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

150

151

152

153

#The Rest of this code will be used every time you want to change a data set

153

#The Rest of this code will be used every time you want to change a data set

154

155

# ---- Inputs ----------------------------------------------------------------
# Ask for the series matrix file to process.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

# Ask for the GPL (platform annotation) file that belongs to it.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

# Disabled: set the working directory to the folder of the series matrix
# file. Currently only works for Windows paths.
##strsplit(alz,"[\\]") %>%
##  .[[1]] %>%
##  .[-length(.)] %>%
##  paste(.,collapse="/") %>%
##  setwd()

# The GPL file counts as a "soft" file when its file name (last path
# component) contains "soft" or "annot".
soft <- grepl("soft|annot", tail(strsplit(genena, "[\\|/]")[[1]], 1))

# ---- Sample annotation (the wordy part of the document) --------------------
# Keep the "!Sample..." rows, except the "!Sample_contact..." ones.
alzword <- read_delim(alz, delim = "\t", comment = "!Series", col_names = FALSE)
alzword <- filter(alzword, grepl("!Sample", X1), !grepl("!Sample_contact", X1))

# One row per sample, one column per field; columns get placeholder names
# "col1", "col2", ... which chngrownm() replaces where it recognises a field.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
# Drop the first row (the field names) once the columns have been renamed.
ALZWORD <- chngrownm(ALZWORD)[-1, ]
# Columns still carrying a placeholder name were not recognised: discard them.
ALZWORD <- dplyr::select(as.data.frame(ALZWORD), -starts_with("col"))

# Reorganise the information within the columns.
ALZWORDF <- cinfo(ALZWORD)

# ---- Expression values (the actual data part of the file) ------------------
alzdat <- read_delim(alz, delim = "\t", col_names = TRUE, comment = "!", skip = 1)
# Transpose everything but the first column so that samples become rows.
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL

201

202

##Is there a clean version of the GPL file available?
# gplnum: every digit found in the GPL file name (last path component).
gplnum <- strsplit(genena, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .)

# Look for the exact cache file. A substring search over list.files() would
# let platform "57" be satisfied by "Clean_GPL570.txt" and then fail to read
# "Clean_GPL57.txt".
clean_gpl_file <- paste0("Clean_GPL", gplnum, ".txt")
clfileex <- as.integer(file.exists(clean_gpl_file))

if (clfileex >= 1) {
  #use the clean version
  geneIDNam <- clean_gpl_file %>%
    read_delim(delim = "\t", col_names = c("ID", "Symbol"), comment = "!")
} else {
  ##Lets Create a clean version: gene ID to gene name

  # Columns of the GPL table that can hold the gene name.
  gpl_name_cols <- "Symbol|^ORF\\s*$|^gene_assignment\\s*$"

  if (soft) {
    # Soft files carry a preamble before the table, so the number of lines
    # to skip is needed. It is cached per platform in GPL_ID_LOC.txt
    # (columns GPL_FILE_NUM and LOC_ID).
    gpl_loc_file <- "GPL_ID_LOC.txt"
    gpl_loc_exists <- file.exists(gpl_loc_file)
    idLOCGPL <- NA_integer_

    if (gpl_loc_exists) {
      #Check to see if this GPL soft file has been used before
      gpl_loc <- read_delim(gpl_loc_file, delim = "\t", col_names = TRUE)
      # Exact comparison: "57" must not pick up the entry stored for "570".
      gpl_hit <- which(gpl_loc$GPL_FILE_NUM == as.integer(gplnum))
      if (length(gpl_hit) > 0) {
        idLOCGPL <- as.integer(gpl_loc$LOC_ID[gpl_hit[1]])
      }
    }

    if (is.na(idLOCGPL)) {
      #No information on this particular GPL file yet: find the header row,
      #i.e. the first row whose first field is "ID".
      # NOTE(review): the row index is counted after read_delim() has dropped
      # the "!" comment lines; confirm this agrees with how `skip` counts
      # lines in the installed readr version.
      gpl_preview <- read_delim(genena, delim = "\t", col_names = FALSE,
                                comment = "!", n_max = 1000)
      gpl_header_row <- grep("^ID\\s*$", gpl_preview[[1]])
      if (length(gpl_header_row) == 0) {
        stop("No header row starting with 'ID' found in the first 1000 rows of ",
             genena, call. = FALSE)
      }
      idLOCGPL <- gpl_header_row[1] - 1

      #Remember the offset for later use: create the file with a header line
      #the first time, append a data line afterwards.
      Firstval <- cbind(as.integer(gplnum), as.integer(idLOCGPL))
      colnames(Firstval) <- c("GPL_FILE_NUM", "LOC_ID")
      write.table(Firstval, file = gpl_loc_file, sep = "\t", row.names = FALSE,
                  col.names = !gpl_loc_exists, append = gpl_loc_exists)
    }

    geneIDNam <- genena %>%
      read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = idLOCGPL) %>%
      dplyr::select(., ID, grep(gpl_name_cols, colnames(.)))
  } else {
    # Plain GPL table: lines starting with "#" are comments.
    geneIDNam <- genena %>%
      read_delim(delim = "\t", comment = "#") %>%
      dplyr::select(., ID, grep(gpl_name_cols, colnames(.)))
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace (result is a two-column character matrix)
  gene_id_t <- t(geneIDNam)
  geneIDNam <- cbind(str_trim(gene_id_t[1, ]), str_trim(gene_id_t[2, ]))

  ##Here is the clean version
  write.table(geneIDNam, file = clean_gpl_file, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

292

293

294

295

##Changing the gene ID to gene name
# cgeneID() translates the first row of t(alzdat) (the ID column of alzdat);
# that translated row supplies the column names of ALZDAT.
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]

##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

#Full Data: sample annotation columns first, expression columns after.
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

# Disabled alternative export (write.matrix comes from MASS):
#nfna <- strsplit(alz,"[\\|/]") %>%
#  .[[1]] %>%
#  .[length(.)] %>%
#  gsub("\\D","",.) %>%
#  c("GSE",.,"after.txt") %>%
#  paste(collapse = "")
#write.matrix(Fullalzdw,file = nfna,sep = "\t")

#Perfect for excel viewing
# Output name: "GSE" + the digits of the series matrix FILE NAME +
# "aftexcel.txt". The path is split on both "\" and "/" (as is done for the
# GPL file), so digits in parent directories never leak into the name.
nfnaex <- strsplit(alz, "[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D", "", .) %>%
  c("GSE", ., "aftexcel.txt") %>%
  paste(collapse = "")
# Written transposed: one row per annotation field / gene, one column per
# sample.
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

326

327

328

329

330

328

331

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Updated