# Efrain Gonzalez / Cleaning and Fixing Data with R

1

#Libraries required to run the code

1

#Libraries required to run the code

2

library(MASS)

2

library(MASS)

3

library(pryr)

3

library(pryr)

4

library(dplyr)

4

library(dplyr)

5

library(tidyr)

5

library(tidyr)

6

library(readr)

6

library(readr)

7

library(stringr)

7

library(stringr)

8

9

10

#Necessary Functions

10

#Necessary Functions

11

#1#Function for handling the changing of row names and column names
#
# Renames the columns of a sample-annotation table based on its contents.
#
# Arguments:
#   mat  A character matrix with column names already set. Row 1 holds the
#        field tag of each column (e.g. "!Sample_title") and row 2 holds the
#        first sample's value for that column.
#
# Returns:
#   `mat` with the same contents and updated column names:
#     "!Sample_source_name_ch1" -> "Brain_Region"
#     "!Sample_title"           -> "Title"
#     "!Sample_geo_accession"   -> "ID_REF"
#   Any other column is named from the text of its row-2 value: "Sex<n>",
#   "PMI<n>", "Age<n>", "Braak<n>" or "Group<n>", where <n> is a running
#   counter per category. Columns that match nothing keep their name.
chngrownm <- function(mat){
  n_col <- dim(mat)[2]

  # Running counters used to number repeated categories (Sex1, Sex2, ...).
  sex_n <- 1
  pmi_n <- 1
  age_n <- 1
  braak_n <- 1
  group_n <- 1

  for (j in seq_len(n_col)) {
    tag <- mat[1, j]

    # %in% (rather than ==) yields FALSE for an NA tag instead of an error.
    # The three fixed tags are mutually exclusive with the value-based
    # matching below, so a title or source name that happens to contain a
    # word such as "Control" or "age" is not renamed a second time.
    if (tag %in% "!Sample_source_name_ch1") {
      colnames(mat)[j] <- "Brain_Region"
    } else if (tag %in% "!Sample_title") {
      colnames(mat)[j] <- "Title"
    } else if (tag %in% "!Sample_geo_accession") {
      colnames(mat)[j] <- "ID_REF"
    } else {
      value <- mat[2, j]

      # Every pattern is tried in turn; when several match, the last match
      # decides the name and each matching category's counter still
      # advances.
      # NOTE(review): "age" also matches inside words such as "stage", so a
      # Braak-stage column advances the Age counter - confirm whether that
      # numbering is wanted.
      if (grepl("Sex|gender|Gender|sex", value)) {
        colnames(mat)[j] <- paste0("Sex", sex_n)
        sex_n <- sex_n + 1
      }
      if (grepl("postmorteminterval|PMI|pmi", value)) {
        colnames(mat)[j] <- paste0("PMI", pmi_n)
        pmi_n <- pmi_n + 1
      }
      if (grepl("age|Age|AGE", value)) {
        colnames(mat)[j] <- paste0("Age", age_n)
        age_n <- age_n + 1
      }
      if (grepl("braak|b&b", value)) {
        colnames(mat)[j] <- paste0("Braak", braak_n)
        braak_n <- braak_n + 1
      }
      if (grepl("group|disease|control|AD|normal|diagnosis|Alzheimer|Control", value)) {
        colnames(mat)[j] <- paste0("Group", group_n)
        group_n <- group_n + 1
      }
    }
  }

  mat
}

57

58

#2#Function for reorganizing information within the columns
#
# Cleans the values of annotation columns according to the column name.
#
# Arguments:
#   mat  A table (the script passes a data frame) whose column names contain
#        "Group", "Age", "Sex", "PMI" or "Braak" for the columns to clean.
#
# Returns:
#   `mat` with, for every column except the first:
#     Group*  text up to and including ": " removed, as well as a trailing
#             " ...;..." part
#     Age*    every non-digit removed, converted to integer
#     Sex*    text up to and including ": " removed
#     PMI*    everything except digits and "." removed, converted to numeric
#     Braak*  text up to and including ": " removed, read as a roman numeral
#             and converted to integer
#   Columns whose name matches none of these are left untouched.
cinfo <- function(mat){
  n_col <- dim(mat)[2]

  # The first column is never processed. seq_len(n)[-1] is empty for a
  # table with fewer than two columns, whereas 2:n would count downwards.
  for (j in seq_len(n_col)[-1]) {
    col_name <- colnames(mat)[j]

    if (grepl("Group", col_name)) {
      mat[, j] <- gsub(".+:\\s|\\s.+;.+", "", mat[, j])
    }
    if (grepl("Age", col_name)) {
      # NOTE(review): removing every non-digit turns "85.5" into 855 -
      # confirm ages are always whole numbers in the input.
      mat[, j] <- as.integer(gsub("\\D", "", mat[, j]))
    }
    if (grepl("Sex", col_name)) {
      mat[, j] <- gsub(".+:\\s", "", mat[, j])
    }
    if (grepl("PMI", col_name)) {
      mat[, j] <- as.numeric(gsub("[^0-9\\.]", "", mat[, j]))
    }
    if (grepl("Braak", col_name)) {
      mat[, j] <- as.integer(as.roman(gsub(".+:\\s", "", mat[, j])))
    }
  }

  mat
}

86

87

#3#Function for changing the gene ID to gene name
#
# Replaces the probe IDs in the first row of DATA with gene names.
#
# Arguments:
#   GeneName  A matrix with one column per probe: row 1 holds the probe ID,
#             row 2 the gene name to use instead.
#   DATA      A matrix whose first row holds probe IDs.
#
# Returns:
#   DATA with every first-row entry that equals an ID in GeneName[1, ]
#   replaced by the corresponding GeneName[2, ] entry. Entries without a
#   matching ID are left unchanged.
cgeneID <- function(GeneName,DATA){
  # IDs are compared for exact equality. Treating each ID as a pattern
  # would let ID "1" rewrite part of "11" and would interpret characters
  # such as "." as regular-expression syntax.
  # Keys are trimmed so that blank padding around an ID does not prevent a
  # match.
  data_ids <- trimws(as.character(DATA[1, ]))
  known_ids <- trimws(as.character(GeneName[1, ]))

  # match() returns, for each data ID, the first column of GeneName with
  # that ID, or NA if there is none.
  idx <- match(data_ids, known_ids)

  # match() pairs NA with NA, so missing IDs are excluded explicitly.
  hit <- !is.na(idx) & !is.na(data_ids)

  DATA[1, hit] <- GeneName[2, idx[hit]]
  DATA
}

98

99

#4#Function for adjusting the gene names
#
# Reduces multi-gene column names such as "DDR1 /// MIR4640" to one name.
#
# Arguments:
#   DiData  A matrix or data frame whose column names are gene names,
#           possibly several joined by "///".
#   usecol  Which of the "///"-separated names to keep (default: the first).
#
# Returns:
#   A character vector with one entry per column of DiData: the `usecol`-th
#   name of that column, or the first name when the column has fewer than
#   `usecol` names. Blanks around the chosen name are removed.
gcnames <- function(DiData,usecol=1){
  n_col <- dim(DiData)[2]
  if (is.null(n_col) || n_col == 0L) {
    return(character(0))
  }

  pieces <- strsplit(colnames(DiData), "///", fixed = TRUE)

  chosen <- vapply(pieces, function(p) {
    if (length(p) >= usecol) p[usecol] else p[1]
  }, character(1), USE.NAMES = FALSE)

  # Splitting "A /// B" on "///" leaves "A " and " B"; strip the blanks so
  # the names compare equal to the plain gene symbols.
  trimws(chosen)
}

115

116

117

118

#The Rest of this code will be used every time you want to change a data set

#Getting the series matrix file
print("Choose the series matrix file that you want to Analyze")
alz <- file.choose()

#Getting the GPL file
print("Choose the GPL file that correlates with the above series matrix file")
genena <- file.choose()


#Set working directory based on the directory of the series matrix file
#(disabled; output files are written to the current working directory)
##strsplit(alz,"[\\]") %>%
## .[[1]] %>%
## .[-length(.)] %>%
## paste(.,collapse="/") %>%
## setwd()


#Working with the wordy part of the document
#Keep only the "!Sample" lines of the file, without the "!Sample_contact" ones
alzword <- alz %>%
  read_delim(delim = "\t", comment = "!Series", col_names = FALSE) %>%
  dplyr::filter(grepl("!Sample", X1)) %>%
  dplyr::filter(!grepl("!Sample_contact", X1))

##Changing row names and column names:
#After transposing, row 1 holds the field tags and each further row is one
#sample. Placeholder names col1, col2, ... are assigned so that chngrownm()
#can rename the columns it recognises.
ALZWORD <- t(alzword)
rownames(ALZWORD) <- NULL
colnames(ALZWORD) <- colnames(ALZWORD, do.NULL = FALSE)
#Drop the tag row; drop = FALSE keeps a matrix even if one sample remains
ALZWORD <- chngrownm(ALZWORD)[-1, , drop = FALSE]
#Columns still carrying a placeholder name were not recognised; remove them
ALZWORD <- ALZWORD %>%
  as.data.frame() %>%
  dplyr::select(-starts_with("col"))

##Reorganizing information within the columns
ALZWORDF <- cinfo(ALZWORD)


#Working with Actual Data part of file
alzdat <- alz %>%
  read_delim(delim = "\t", col_names = TRUE, comment = "!", skip = 1)
#Transpose without the first (ID) column: one row per sample
ALZDAT <- t(alzdat[, -1])
rownames(ALZDAT) <- NULL


##Gene ID to Gene Name
#Keep the ID column plus every column whose name contains "Symbol" or "ORF"
geneIDNam <- genena %>%
  read_delim(delim = "\t", comment = "#") %>%
  dplyr::select(., ID, grep("Symbol|ORF", colnames(.)))

##Changing the ID to a Name
#Row 1 of t(alzdat) holds the IDs; cgeneID() swaps them for gene names
ALZDAT1 <- cgeneID(t(geneIDNam), t(alzdat))
colnames(ALZDAT) <- ALZDAT1[1, ]


##Adjusting the column names aka the gene names
colnames(ALZDAT) <- gcnames(ALZDAT)


#Full Data
#Sample annotation columns first, followed by the expression values
Fullalzdw <- ALZDAT %>%
  as.data.frame() %>%
  cbind(ALZWORDF, .)

181

182

##since the order in which the packages are added matters I moved this package to the top
##library(MASS)

#Build the output file names from the digits in the chosen file's name.
#basename() removes the directory part with the platform's own path rules,
#so digits that occur in folder names never end up in the output name.
gse_digits <- gsub("\\D", "", basename(alz))

#Tab-separated table, one row per sample
nfna <- paste0("GSE", gse_digits, "after.txt")
MASS::write.matrix(Fullalzdw, file = nfna, sep = "\t")

#Perfect for excel viewing
#Transposed copy of the same table
nfnaex <- paste0("GSE", gse_digits, "aftexcel.txt")
write.table(t(Fullalzdw), file = nfnaex, sep = "\t")

199

# GITLAB

# Efrain Gonzalez / Cleaning and Fixing Data with R

# Added folder