# Efrain Gonzalez / Cleaning and Fixing Data with R
##Posted 6/15/2017

#Libraries required to run the code
library(pryr)
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)

#Necessary Functions

#1#Function for handling the changing of row names and column names
# Renames the columns of `mat` according to the sample metadata it holds.
#   mat: matrix whose row 1 carries the "!Sample_*" field labels and whose
#        row 2 carries the first sample's values. Column names must already
#        be set, because they are replaced one position at a time.
# Returns `mat` with the matched column names replaced; unmatched columns
# keep their existing names.
chngrownm <- function(mat){
  n_col <- dim(mat)[2]
  # Row 2 is only inspected when it exists.
  has_values <- dim(mat)[1] >= 2
  # Running suffixes so repeated fields become Sex1, Sex2, ...
  sex_n <- 1
  pmi_n <- 1
  age_n <- 1
  braak_n <- 1
  group_n <- 1
  for(j in seq_len(n_col)){
    label <- mat[1,j]
    # An NA label would make the comparisons below fail inside if().
    if(is.na(label)){
      label <- ""
    }
    # The three fixed labels are mutually exclusive and final: a column
    # recognised here is never re-labelled by the keyword checks below
    # (a title such as "AD case 1" must stay "Title", not become "Group1").
    if(label == "!Sample_source_name_ch1"){
      colnames(mat)[j] <- "Brain_Region"
    } else if(label == "!Sample_title"){
      colnames(mat)[j] <- "Title"
    } else if(label == "!Sample_geo_accession"){
      colnames(mat)[j] <- "ID_REF"
    } else if(has_values){
      value <- mat[2,j]
      # The checks run in sequence, so when several patterns match the same
      # value the last matching one decides the final column name.
      if(grepl("Sex|gender|Gender|sex",value)){
        colnames(mat)[j] <- paste0("Sex",sex_n)
        sex_n <- sex_n + 1
      }
      if(grepl("postmorteminterval|PMI|pmi",value)){
        colnames(mat)[j] <- paste0("PMI",pmi_n)
        pmi_n <- pmi_n + 1
      }
      if(grepl("age|Age|AGE",value)){
        colnames(mat)[j] <- paste0("Age",age_n)
        age_n <- age_n + 1
      }
      if(grepl("braak|b&b",value)){
        colnames(mat)[j] <- paste0("Braak",braak_n)
        braak_n <- braak_n + 1
      }
      if(grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control|Normal",value)){
        colnames(mat)[j] <- paste0("Group",group_n)
        group_n <- group_n + 1
      }
    }
  }
  mat
}

#2#Function for reorganizing information within the columns
# Cleans the values of the clinical columns, chosen by column name.
#   mat: table (data frame in the calling script) whose column names contain
#        "Group", "Age", "Sex", "PMI" or "Braak" for the columns to clean.
# Returns `mat` with those columns rewritten. The first column is never
# touched, because the scan starts at column 2.
cinfo <- function(mat){
  n_col <- dim(mat)[2]
  # With fewer than two columns there is nothing to scan; 2:n_col would
  # count backwards and rewrite column 1.
  if(n_col < 2){
    return(mat)
  }
  for(j in 2:n_col){
    col_name <- colnames(mat)[j]
    if(grepl("Group",col_name)){
      # Drop the "label: " prefix and any trailing "; ..." remainder.
      mat[,j] <- gsub(".+:\\s|\\s.+;.+","",mat[,j])
    }
    if(grepl("Age",col_name)){
      # NOTE(review): "\\D" also removes a decimal point, so a value such as
      # "85.5" becomes 855 - confirm ages are always whole numbers.
      mat[,j] <- as.integer(gsub("\\D","",mat[,j]))
    }
    if(grepl("Sex",col_name)){
      mat[,j] <- gsub(".+:\\s","",mat[,j])
    }
    if(grepl("PMI",col_name)){
      # Keep digits and dots only, then read as a number.
      mat[,j] <- as.numeric(gsub("[^0-9\\.]","",mat[,j]))
    }
    if(grepl("Braak",col_name)){
      # Braak values are parsed as Roman numerals and stored as integers.
      mat[,j] <- as.integer(as.roman(gsub(".+:\\s","",mat[,j])))
    }
  }
  mat
}

#3#Function for labeling the gene IDs without names
# Fills in missing gene names with the gene ID from the same row.
#   GIDNAM: two-column table (data frame / tibble, or matrix) with the ID in
#           column 1 and the gene name in column 2.
# A name counts as missing when it is NA or the text "NA" (optionally
# followed by whitespace). Returns GIDNAM with column 2 filled in.
NAFIXING <- function(GIDNAM){
  is_df <- is.data.frame(GIDNAM)
  ids <- if(is_df) GIDNAM[[1]] else GIDNAM[,1]
  gene_names <- if(is_df) GIDNAM[[2]] else GIDNAM[,2]
  # A factor cannot take values outside its levels, so work on text.
  if(is.factor(gene_names)){
    gene_names <- as.character(gene_names)
  }
  unnamed <- is.na(gene_names) | grepl("^NA\\s*$",gene_names)
  # Only write back when something changes, so an untouched column keeps
  # its type and an empty table passes straight through.
  if(any(unnamed)){
    gene_names[unnamed] <- as.character(ids)[unnamed]
    if(is_df){
      GIDNAM[[2]] <- gene_names
    } else{
      GIDNAM[,2] <- gene_names
    }
  }
  GIDNAM
}

#4#Function for changing the gene ID to gene name
# Replaces the IDs in the first row of DATA with their gene names.
#   GeneName: matrix with the IDs in row 1 and the matching names in row 2.
#   DATA:     matrix whose first row holds the IDs to translate.
# Returns DATA with every first-row entry that equals an ID in GeneName
# replaced by that ID's name; all other entries are left as they are.
cgeneID <- function(GeneName,DATA){
  # Exact lookup of all IDs in one pass. IDs are compared as plain text, so
  # characters such as "." or "+" in an ID cannot match other IDs, and a
  # name that happens to equal another ID is not translated a second time.
  hit <- match(DATA[1,],GeneName[1,],incomparables = NA)
  found <- which(!is.na(hit))
  if(length(found) > 0){
    DATA[1,found] <- GeneName[2,hit[found]]
  }
  DATA
}

#5#Function for adjusting the gene names
# Picks one gene name out of each "///"-separated column name.
#   DiData: table whose column names may list several names joined by "///".
#   usecol: position of the name to keep; when a column name has fewer
#           pieces than `usecol`, its first piece is used instead.
# Returns a character vector of trimmed names, one per column of DiData.
gcnames <- function(DiData,usecol=1){
  col_labels <- colnames(DiData)
  if(is.null(col_labels)){
    if(dim(DiData)[2] > 0){
      stop("DiData must have column names", call. = FALSE)
    }
    # No columns at all: nothing to rename.
    return(character(0))
  }
  vapply(
    col_labels,
    function(label){
      pieces <- strsplit(label,"///")[[1]]
      keep <- if(length(pieces) >= usecol) usecol else 1
      str_trim(pieces[keep])
    },
    character(1),
    USE.NAMES = FALSE
  )
}

#6# Function for discretizing the data
# Converts scaled values into three classes.
#   NDATA: numeric table (matrix in the calling script).
# Returns a numeric matrix of the same size and column names where
#   values below -1        become 0,
#   values from -1 to 1    become 1 (both ends included),
#   values above 1         become 2,
# and missing values are carried over unchanged.
dndat <- function(NDATA){
  vals <- as.matrix(NDATA)
  # Start everything in the middle class, then move the two tails. A value
  # of exactly 1 therefore lands in the middle class instead of being left
  # at 0, which is the low class.
  DDATA <- matrix(1,nrow=dim(NDATA)[1],ncol=dim(NDATA)[2])
  colnames(DDATA) <- colnames(NDATA)
  DDATA[which(vals < -1)] <- 0
  DDATA[which(vals > 1)] <- 2
  gaps <- which(is.na(vals))
  DDATA[gaps] <- vals[gaps]
  DDATA
}

#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
# The user picks the file interactively; `alz` holds its path.
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
# `genena` holds the path of the platform (GPL) file chosen by the user.
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()

#Find out if it is a soft GPL file or not
# Only the last path component (the file name) is tested for "soft"/"annot".
soft <- grepl("soft|annot", utils::tail(strsplit(genena,"[\\|/]")[[1]], 1))

#Working with the wordy part of the document
# Read the series matrix without the "!Series" lines and keep the "!Sample"
# rows, except the "!Sample_contact" ones.
alzword <- alz %>%
  read_delim(delim ="\t",comment = "!Series",col_names = FALSE)%>%
  filter(grepl("!Sample",X1))%>%
  filter(!grepl("!Sample_contact",X1))

##Changing row names and column names:
# After transposing, each column is one "!Sample_*" field and row 1 holds
# the field labels.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
# Give every column a placeholder name ("col1", "col2", ...) so that
# chngrownm() can replace names position by position.
colnames(ALZWORD) <- colnames(ALZWORD,do.NULL=FALSE)
# Drop the label row; drop = FALSE keeps a matrix even when only one sample
# row is left.
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
# Columns that still carry a placeholder name were not recognised: drop them.
ALZWORD <- ALZWORD%>%
  as.data.frame()%>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)

#Working with Actual Data part of file
# Read the table part of the series matrix: lines starting with "!" are
# ignored and the first line is skipped.
alzdat <- read_delim(alz,delim="\t",col_names=TRUE,comment = "!",skip=1)
# Transpose everything except the first column, then clear the row names.
ALZDAT <- t(alzdat[,-1])
rownames(ALZDAT) <- NULL

##Is there a clean version of the GPL file available?
# The GPL number is every digit found in the chosen GPL file's name.
gplnum <- strsplit(genena,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.)

# Reads a GPL table and keeps the ID column plus the gene-name column(s).
# Extra arguments are passed on to read_delim().
read_gpl_symbols <- function(path, ...){
  read_delim(path, delim = "\t", ...) %>%
    dplyr::select(.,ID,grep("Symbol|^ORF\\s*$|^gene_assignment\\s*$",colnames(.)))
}

# Finds how many lines to skip before the "ID" header line of a soft file.
# NOTE(review): the position comes from a linear index into the transposed
# table, which equals the row number only while the first 1000 rows are
# parsed as a single column - confirm for the GPL files in use.
find_gpl_header_offset <- function(path){
  first_rows <- read_delim(path, delim="\t",col_names = FALSE, comment = "!", n_max = 1000)
  id_at <- grep("^ID\\s*$",t(first_rows))
  if(length(id_at) == 0){
    stop("No 'ID' header line found in ", path, call. = FALSE)
  }
  # Use the first occurrence so the result is always a single number.
  id_at[1] - 1
}

# Look for the exact clean file name: a substring search over list.files()
# also accepts e.g. "Clean_GPL9600.txt" when the number is 96.
clean_gpl_path <- paste0("Clean_GPL",gplnum,".txt")
clfileex <- as.integer(file.exists(clean_gpl_path))
if(clfileex >= 1){
  #use the clean version
  geneIDNam <- clean_gpl_path %>%
    read_delim(delim="\t",col_names = c("ID","Symbol"), comment = "!")

}
if(clfileex == 0){
  ##Lets Create a clean version

  ##Gene ID to Gene Name
  if(soft == TRUE){
    #Check to see if there is already a file containing information on soft files
    fileex <- as.integer(file.exists("GPL_ID_LOC.txt"))
    if(fileex == 1){
      #Check to see if this GPL soft file has been used before
      gpl_loc_table <- read_delim("GPL_ID_LOC.txt",delim = "\t",col_names = TRUE)
      # Compare whole numbers: a pattern search for "96" would also hit 1296.
      IDLOCAL <- which(as.integer(gpl_loc_table$GPL_FILE_NUM) == as.integer(gplnum))
      IDF <- length(IDLOCAL)
      if(IDF >= 1){
        # If the number was recorded more than once, the first entry is used.
        idlocgpl <- gpl_loc_table$LOC_ID[IDLOCAL[1]]
        geneIDNam <- read_gpl_symbols(genena, col_names = TRUE, comment = "!", skip = idlocgpl)
      }
      if(IDF == 0){
        #No information on this particular GPL file
        idLOCGPL <- find_gpl_header_offset(genena)
        # Remember the offset for next time by appending one line.
        cbind(as.integer(gplnum),as.integer(idLOCGPL)) %>%
          cat(file="GPL_ID_LOC.txt",sep = "\t", fill = TRUE, append = TRUE)
        geneIDNam <- read_gpl_symbols(genena, col_names = TRUE, comment = "!", skip = idLOCGPL)
      }
    }
    if(fileex == 0){
      #We must create a file that we can access for later use
      idLOCGPL <- find_gpl_header_offset(genena)
      Firstval <- cbind(as.integer(gplnum),as.integer(idLOCGPL))
      colnames(Firstval) <- c("GPL_FILE_NUM","LOC_ID")
      write.table(Firstval,file = "GPL_ID_LOC.txt", sep = "\t",row.names = FALSE, col.names = TRUE)
      geneIDNam <- read_gpl_symbols(genena, col_names = TRUE, comment = "!", skip = idLOCGPL)
    }
  }
  if(soft == FALSE){
    geneIDNam <- read_gpl_symbols(genena, comment = "#")
  }

  ##Labeling the gene IDs without names
  geneIDNam <- NAFIXING(geneIDNam)

  ##remove the whitespace
  # Only the first two columns (ID and name) are kept, as a text matrix.
  geneIDNam <- t(rbind(str_trim(t(geneIDNam)[1,]),str_trim(t(geneIDNam)[2,])))

  ##Here is the clean version
  write.table(geneIDNam,file = paste0("Clean_GPL",gplnum,".txt"),sep = "\t",row.names = FALSE, col.names = FALSE)
}

##Changing the gene ID to gene name
# cgeneID() translates the first row of the transposed data table; that row
# then supplies the column names of ALZDAT.
ALZDAT1 <- cgeneID(t(geneIDNam),t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1,]

##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)

#Full RAW Data
# Clinical columns first, followed by the data columns.
Fullalzdwr <- cbind(ALZWORDF,as.data.frame(ALZDAT))

#Raw file is output
# The output name is "GSE" + the digits of the chosen file's name +
# "aftexcel.txt". The path is split on "\", "|" and "/" - the same separators
# used for the GPL file - so that only the file name contributes digits,
# whichever separator the path uses.
nfnaex <- strsplit(alz,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.) %>%
  c("GSE",.,"aftexcel.txt") %>%
  paste(collapse = "")
write.table(t(Fullalzdwr), file = nfnaex, sep = "\t")

#Now for the discretization part
##get the wordy part again
# After transposing, each row is one clinical field (named by the row names)
# and each column is one sample.
rawword <- t(ALZWORDF)

##where is ID_REF located
# The field names live in the row names, not in the first sample's values,
# so that is where ID_REF has to be looked up.
hereim <- which(rownames(rawword) == "ID_REF")
if(length(hereim) != 1){
  stop("Expected exactly one ID_REF row in the clinical table", call. = FALSE)
}

##Subject Names GSM...
subjnam <- rawword[hereim,]

##Getting the names for the rows
namedarows <- rownames(rawword)[-hereim] %>%
  as.data.frame()
# drop = FALSE keeps a matrix even when a single field is left.
RAWWORD <- rawword[-hereim, , drop = FALSE] %>%
  as.data.frame() %>%
  bind_cols(namedarows,.)

# Number of missing values in each row, added as a last column.
naroww <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(RAWWORD)))))
RAWWORD <- bind_cols(RAWWORD,naroww)

# Gene names (the column names of ALZDAT) as a one-column table.
roALZna <- as.data.frame(colnames(ALZDAT))
colnames(roALZna) <- "ID_REF"

# The data with one row per gene, stripped of its row and column names.
RAWDAT <- as.data.frame(t(ALZDAT))
colnames(RAWDAT) <- NULL
rownames(RAWDAT) <- NULL

# Gene name first, then the values, sorted by gene name.
RAWDAT2 <- dplyr::arrange(cbind(roALZna,RAWDAT),ID_REF)

##Editing the file for R processing
# Names and numbers are split apart; the numbers are converted column by
# column and transposed.
RAWDATID <- as.matrix(RAWDAT2[,1])

RAWDATNUM <- t(mapply(RAWDAT2[,-1],FUN = as.numeric))

##Consolidating genes with the same name
###create empty matrix of size equal to tabRDATID
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
# Column positions of every gene name, found in one pass instead of one
# full scan per gene. Names are compared exactly, as with "==".
gene_group <- match(as.vector(RAWDATID),rownames(tabRDATID))
gene_cols <- split(seq_along(gene_group),
                   factor(gene_group, levels = seq_along(tabRDATID)))
for(j in seq_along(tabRDATID)){
  here <- gene_cols[[j]]

  ##Putting the ones without duplicates in their new homes
  if(length(here) == 1){
    NuRDATN[,j] <- RAWDATNUM[,here]
  }
  ##Averaging duplicates and putting them in their new homes
  if(length(here) > 1){
    # drop = FALSE keeps a matrix for rowMeans() even with a single sample.
    NuRDATN[,j] <- rowMeans(RAWDATNUM[,here,drop = FALSE],na.rm = TRUE)
  }
}

##Scaling the Data
# scale() is applied with its defaults; the attributes it attaches are
# removed and the gene names are restored as column names.
scrawdat <- scale(NuRDATN)
attr(scrawdat,"scaled:center") <- NULL
attr(scrawdat,"scaled:scale") <- NULL
colnames(scrawdat) <- rownames(tabRDATID)

##Discretized the Data
# Discretize, then transpose so that each row is one gene.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

##setting "ID_REF" as a new variable
# The gene names move from the row names into a first column.
geneNAM <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
colnames(geneNAM) <- "ID_REF"
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(geneNAM,dialzdat)

##NAs in a column
# One extra row labelled "COL_NAs" holding the number of missing values in
# every column after the first.
total_cols <- dim(dialzdat)[2]
nacol <- as.data.frame(t(rep(0,total_cols)),stringsAsFactors = FALSE)
nacol[1,1] <- "COL_NAs"
for(col_at in 2:total_cols){
  nacol[1,col_at] <- as.integer(sum(is.na(dialzdat[,col_at])))
}
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat,nacol)

##NAs in a row
# One extra column "ROW_NAs" with the number of missing values per row
# (the COL_NAs row is included).
narowd <- data.frame(ROW_NAs = as.numeric(unname(rowSums(is.na(dialzdat)))))
dialzdat <- bind_cols(dialzdat,narowd)
# The columns between ID_REF and ROW_NAs are named after the subjects, and
# the clinical table takes over the same column names.
colnames(dialzdat)[2:(dim(dialzdat)[2]-1)] <- subjnam
colnames(RAWWORD) <- colnames(dialzdat)
##converting to character so that the clinical can be brought together with discrete data
# Every column except the last one (ROW_NAs) is converted, the first one
# included.
text_cols <- seq_len(dim(dialzdat)[2]-1)
dialzdat[text_cols] <- lapply(dialzdat[text_cols],as.character)
#The End the full data
Dscrtalzdw <- bind_rows(RAWWORD,dialzdat)

#Produces Discrete file
# The output name is "GSE" + the digits of the chosen series matrix file's
# name + "dscrt.txt". The name is taken from `alz`, the path chosen at the
# start of the script.
nfnaex <- strsplit(alz,"[\\|/]") %>%
  .[[1]] %>%
  .[length(.)] %>%
  gsub("\\D","",.) %>%
  c("GSE",.,"dscrt.txt") %>%
  paste(collapse = "")
write.table(Dscrtalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)

# GITLAB
# Efrain Gonzalez / Cleaning and Fixing Data with R
# Update (UNTESTED)