Efrain Gonzalez / Cleaning and Fixing Data with R

1

#For Reading Raw Data from the created file

1

#For Reading Raw Data from the created file

2

3

#Required Libraries

3

#Required Libraries

4

library(MASS)

4

library(MASS)

5

library(dplyr)

5

library(dplyr)

6

library(tidyr)

6

library(tidyr)

7

library(readr)

7

library(readr)

8

library(stringr)

8

library(stringr)

9

10

11

#Necessary Functions

11

#Necessary Functions

12

13

#1# Function for discretizing the data

13

#1# Function for discretizing the data

14

dndat <- function(NDATA){

14

dndat <- function(NDATA){

15

rownd <- dim(NDATA)[1]

15

rownd <- dim(NDATA)[1]

16

colnd <- dim(NDATA)[2]

16

colnd <- dim(NDATA)[2]

17

DDATA <- matrix(0,nrow=rownd,ncol=colnd)

17

DDATA <- matrix(0,nrow=rownd,ncol=colnd)

18

colnames(DDATA) <- colnames(NDATA)

18

colnames(DDATA) <- colnames(NDATA)

19

i = 1

19

i = 1

20

for(i in 1:rownd){

20

for(i in 1:rownd){

21

j <- 1

21

for(j in 1:colnd){

22

for(j in 1:colnd){

22

if(is.na(NDATA[i,j])==FALSE){

23

if(is.na(NDATA[i,j])==FALSE){

23

24

if(NDATA[i,j] < -1){

25

if(NDATA[i,j] < -1){

25

DDATA[i,j]=0L

26

DDATA[i,j]=0L

26

}

27

}

27

if(NDATA[i,j] > 1){

28

if(NDATA[i,j] > 1){

28

DDATA[i,j]=2L

29

DDATA[i,j]=2L

29

}

30

}

30

if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){

31

if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){

31

DDATA[i,j]=1L

32

DDATA[i,j]=1L

32

}

33

}

33

} else{

34

} else{

34

DDATA[i,j] = NDATA[i,j]

35

DDATA[i,j] = NDATA[i,j]

35

}

36

}

36

j = j + 1

37

j = j + 1

37

}

38

}

38

i = i + 1

39

i = i + 1

39

}

40

}

40

DDATA

41

DDATA

41

}

42

}

42

43

44

#Bringing in the file

45

#Bringing in the file

45

rawdat <- file.choose()

46

rawdat <- file.choose()

46

RAWDAT <- rawdat %>%

47

RAWDAT <- rawdat %>%

47

read_delim(delim ="\t",col_names = FALSE,skip=1) %>%

48

read_delim(delim ="\t",col_names = FALSE,skip=1) %>%

48

filter(.,!grepl("Group|Age|Region|PMI|Title|Sex|Braak",X1))

49

filter(.,!grepl("Group|Age|Region|PMI|Title|Sex|Braak",X1))

49

attributes(RAWDAT)$names <- RAWDAT[1,]

50

attributes(RAWDAT)$names <- RAWDAT[1,]

50

51

#Just the clinical data

52

#Just the clinical data

52

RAWWORD <- rawdat %>%

53

RAWWORD <- rawdat %>%

53

read_delim(delim ="\t",col_names = FALSE,skip=1) %>%

54

read_delim(delim ="\t",col_names = FALSE,skip=1) %>%

54

filter(.,grepl("Group|Age|Region|PMI|Title|Sex|Braak",X1))

55

filter(.,grepl("Group|Age|Region|PMI|Title|Sex|Braak",X1))

55

attributes(RAWWORD)$names <- RAWDAT[1,]

56

attributes(RAWWORD)$names <- RAWDAT[1,]

56

#Add col of NAs to clinical data

57

#Add col of NAs to clinical data

57

z <- 1

58

z <- 1

58

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

59

naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)

59

for(z in 1:dim(RAWWORD)[1]){

60

for(z in 1:dim(RAWWORD)[1]){

60

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

61

naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))

61

z <- z + 1

62

z <- z + 1

62

}

63

}

63

colnames(naroww) <- "ROW_NAs"

64

colnames(naroww) <- "ROW_NAs"

64

RAWWORD <- bind_cols(RAWWORD,naroww)

65

RAWWORD <- bind_cols(RAWWORD,naroww)

65

66

67

##Getting back to the data

68

##Getting back to the data

68

RAWDAT2 <- RAWDAT[-1,] %>%

69

RAWDAT2 <- RAWDAT[-1,] %>%

69

dplyr::arrange(.,ID_REF)

70

dplyr::arrange(.,ID_REF)

70

71

##Editing the file for R processing

72

##Editing the file for R processing

72

RAWDATID <- RAWDAT2[,1] %>%

73

RAWDATID <- RAWDAT2[,1] %>%

73

as.matrix(.)

74

as.matrix(.)

74

RAWDATNUM <- RAWDAT2[,-1] %>%

75

RAWDATNUM <- RAWDAT2[,-1] %>%

75

mapply(.,FUN = as.numeric) %>%

76

mapply(.,FUN = as.numeric) %>%

76

t(.)

77

t(.)

77

78

##Consolidating genes with the same name

79

##Consolidating genes with the same name

79

tabRDATID <- table(RAWDATID)

80

tabRDATID <- table(RAWDATID)

80

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

81

NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))

81

j <- 1

82

j <- 1

82

for(j in 1:length(tabRDATID)){

83

for(j in 1:length(tabRDATID)){

83

##Putting the ones without duplicates in their new homes

84

##Putting the ones without duplicates in their new homes

84

if(tabRDATID[j] == 1){

85

if(tabRDATID[j] == 1){

85

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

86

NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]

86

}

87

}

87

##Averaging duplicates and putting them in their new homes

88

##Averaging duplicates and putting them in their new homes

88

if(tabRDATID[j] > 1){

89

if(tabRDATID[j] > 1){

89

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

90

NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)

90

}

91

}

91

j <- j + 1

92

j <- j + 1

92

}

93

}

93

94

95

#Scaling the Data

96

#Scaling the Data

96

scrawdat <- NuRDATN%>%

97

scrawdat <- NuRDATN%>%

97

scale()

98

scale()

98

attr(scrawdat,"scaled:center") <- NULL

99

attr(scrawdat,"scaled:center") <- NULL

99

attr(scrawdat,"scaled:scale") <- NULL

100

attr(scrawdat,"scaled:scale") <- NULL

100

colnames(scrawdat) <- rownames(tabRDATID)

101

colnames(scrawdat) <- rownames(tabRDATID)

101

102

103

#Discretized the Data

104

#Discretized the Data

104

dialzdat <- scrawdat %>%

105

dialzdat <- scrawdat %>%

105

dndat(.) %>%

106

dndat(.) %>%

106

t()%>%

107

t()%>%

107

as.data.frame(.)

108

as.data.frame(.)

108

colnames(dialzdat) <- rownames(RAWDATNUM)

109

colnames(dialzdat) <- rownames(RAWDATNUM)

109

110

#gene names

111

#gene names

111

genena <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

112

genena <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))

112

#setting "ID_REF" as a new variable

113

#setting "ID_REF" as a new variable

113

colnames(genena) <- "ID_REF"

114

colnames(genena) <- "ID_REF"

114

rownames(dialzdat) <- NULL

115

rownames(dialzdat) <- NULL

115

dialzdat <-bind_cols(genena,dialzdat)

116

dialzdat <-bind_cols(genena,dialzdat)

116

117

#NAs in a column

118

#NAs in a column

118

x <- 2

119

x <- 2

119

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

120

nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)

120

nacol[1,1] = "COL_NAs"

121

nacol[1,1] = "COL_NAs"

121

for(x in 2:dim(dialzdat)[2]){

122

for(x in 2:dim(dialzdat)[2]){

122

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

123

nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))

123

x <- x + 1

124

x <- x + 1

124

}

125

}

125

colnames(nacol) <- colnames(dialzdat)

126

colnames(nacol) <- colnames(dialzdat)

126

dialzdat<-bind_rows(dialzdat,nacol)

127

dialzdat<-bind_rows(dialzdat,nacol)

127

128

#NAs in a row

129

#NAs in a row

129

y <- 1

130

y <- 1

130

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

131

narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)

131

for(y in 1:dim(dialzdat)[1]){

132

for(y in 1:dim(dialzdat)[1]){

132

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

133

narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))

133

y <- y + 1

134

y <- y + 1

134

}

135

}

135

colnames(narowd) <- "ROW_NAs"

136

colnames(narowd) <- "ROW_NAs"

136

dialzdat <- bind_cols(dialzdat,narowd)

137

dialzdat <- bind_cols(dialzdat,narowd)

137

138

#converting to character so that the clinical can be brought together with discrete data

139

#converting to character so that the clinical can be brought together with discrete data

139

k <- 2

140

k <- 2

140

for(k in 2:dim(dialzdat)[2]-1){

141

for(k in 2:dim(dialzdat)[2]-1){

141

dialzdat[,k] <- as.character(dialzdat[,k])

142

dialzdat[,k] <- as.character(dialzdat[,k])

142

k <- k + 1

143

k <- k + 1

143

}

144

}

144

145

146

#The End the full data

147

#The End the full data

147

Fullalzdw <- bind_rows(RAWWORD,dialzdat)

148

Fullalzdw <- bind_rows(RAWWORD,dialzdat)

148

149

#Create the file

150

#Create the file

150

nfnaex <- strsplit(rawdat,"[\\|/]") %>%

151

nfnaex <- strsplit(rawdat,"[\\|/]") %>%

151

.[[1]] %>%

152

.[[1]] %>%

152

.[length(.)] %>%

153

.[length(.)] %>%

153

gsub("\\D","",.) %>%

154

gsub("\\D","",.) %>%

154

c("GSE",.,"dscrt.txt") %>%

155

c("GSE",.,"dscrt.txt") %>%

155

paste(collapse = "")

156

paste(collapse = "")

156

write.table(Fullalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)

157

write.table(Fullalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)

157

158

159

GITLAB

Efrain Gonzalez / Cleaning and Fixing Data with R

Update

 #For Reading Raw Data from the created file
 #Required Libraries
 library(MASS)
 library(dplyr)
 library(tidyr)
 library(readr)
 library(stringr)
 #Necessary Functions
 #1# Function for discretizing the data
 # Maps every non-missing entry of NDATA to one of three levels:
 #   0  when the value is below -1
 #   1  when the value lies in [-1, 1]
 #   2  when the value is above 1
 # Missing entries (anything is.na() reports) are copied through unchanged.
 # In this script the function is applied to the output of scale().
 #
 # NDATA: a numeric matrix (or all-numeric data frame).
 # Returns a numeric matrix with the same dimensions and column names as
 # NDATA; row names are not carried over.
 dndat <- function(NDATA){
   values <- as.matrix(NDATA)
   DDATA <- matrix(0, nrow = dim(NDATA)[1], ncol = dim(NDATA)[2])
   colnames(DDATA) <- colnames(NDATA)
   missing <- is.na(values)
   # Both bounds are inclusive for the middle level, so a value of exactly 1
   # is classified as 1 instead of falling through every test and keeping
   # the initial 0.
   DDATA[!missing & values >= -1 & values <= 1] <- 1
   DDATA[!missing & values > 1] <- 2
   # Copy missing entries from the input so NA / NaN are kept as they were.
   DDATA[missing] <- values[missing]
   DDATA
 }
 #Bringing in the file
 # Ask the user for the tab-delimited input file and parse it once. The
 # first line of the file is skipped and no header is read, so readr names
 # the columns X1, X2, ...
 rawdat <- file.choose()
 rawall <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
 # Rows whose first field mentions one of these words are the clinical
 # annotation rows; every other row goes to RAWDAT.
 isclin <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", rawall$X1)
 RAWDAT <- rawall[!isclin, ]
 #Just the clinical data
 RAWWORD <- rawall[isclin, ]
 # The first remaining row of RAWDAT holds the column labels (later code
 # refers to a column called ID_REF, so the first label is presumably
 # "ID_REF" -- confirm against the input format). Use it to name both tables.
 rawhdr <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
 names(RAWDAT) <- rawhdr
 names(RAWWORD) <- rawhdr
 #Add col of NAs to clinical data
 # Count the missing cells in each clinical row in one vectorised pass
 # (this also behaves sensibly when RAWWORD has no rows) and append the
 # counts as a numeric column named ROW_NAs.
 naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
 RAWWORD <- bind_cols(RAWWORD, naroww)
 ##Getting back to the data
 # Drop the label row (already used for the column names) and sort the
 # remaining rows by ID_REF.
 RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)
 ##Editing the file for R processing
 # One-column character matrix holding the identifiers.
 RAWDATID <- as.matrix(RAWDAT2[, 1])
 # Convert every remaining column to numeric and stack the results as rows,
 # so RAWDATNUM has one row per input column (row names = column labels)
 # and one column per input row. Using rbind() keeps this a matrix with
 # that orientation even when RAWDAT2 has a single row.
 RAWDATNUM <- do.call(rbind, lapply(RAWDAT2[, -1], as.numeric))
 ##Consolidating genes with the same name
 # tabRDATID counts how often each identifier occurs; NuRDATN gets one
 # column per distinct identifier, in the order of the table.
 tabRDATID <- table(RAWDATID)
 NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
 idnames <- names(tabRDATID)
 for (idpos in seq_along(idnames)) {
   idcols <- which(RAWDATID == idnames[idpos])
   if (length(idcols) == 1) {
     ##Putting the ones without duplicates in their new homes
     # Copied as-is so that a missing value stays NA.
     NuRDATN[, idpos] <- RAWDATNUM[, idcols]
   } else {
     ##Averaging duplicates and putting them in their new homes
     # Row-wise mean over the duplicated columns, ignoring missing values.
     NuRDATN[, idpos] <- rowMeans(
       RAWDATNUM[, idcols, drop = FALSE],
       na.rm = TRUE
     )
   }
 }
 #Scaling the Data
 # Centre and scale each column of the consolidated matrix with scale().
 scrawdat <- scale(NuRDATN)
 # scale() records the centres and scales it used as attributes; remove
 # them so only the plain matrix is left.
 attr(scrawdat, "scaled:scale") <- NULL
 attr(scrawdat, "scaled:center") <- NULL
 # Label the columns with the identifiers from the frequency table.
 colnames(scrawdat) <- dimnames(tabRDATID)[[1]]
 #Discretized the Data
 # Discretise the scaled matrix with dndat(), then transpose so that each
 # identifier becomes a row, and convert the result to a data frame.
 dialzdat <- as.data.frame(t(dndat(scrawdat)))
 colnames(dialzdat) <- rownames(RAWDATNUM)
 #gene names
 #setting "ID_REF" as a new variable
 # Move the row names into an explicit ID_REF column placed first.
 genena <- data.frame(ID_REF = rownames(dialzdat))
 rownames(dialzdat) <- NULL
 dialzdat <- bind_cols(genena, dialzdat)
 #NAs in a column
 # Build a one-row summary: the label "COL_NAs" in the first column and,
 # for every other column, the number of missing values in that column.
 # colSums() handles all columns at once and also copes with a table that
 # has only the first column.
 nacounts <- colSums(is.na(dialzdat[, -1, drop = FALSE]))
 nacol <- as.data.frame(
   c(list("COL_NAs"), as.list(unname(nacounts))),
   stringsAsFactors = FALSE
 )
 # Give the summary row the same column names, then append it at the bottom.
 colnames(nacol) <- colnames(dialzdat)
 dialzdat <- bind_rows(dialzdat, nacol)
 #NAs in a row
 # Count the missing cells in every row (all columns included) in one
 # vectorised pass and append the counts as a numeric column ROW_NAs.
 narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
 dialzdat <- bind_cols(dialzdat, narowd)
 #converting to character so that the clinical can be brought together with discrete data
 # Convert every column except the last one to character. The previous
 # range `2:dim(dialzdat)[2]-1` parsed as `(2:n) - 1`, i.e. 1 to n - 1,
 # because `:` binds tighter than `-`; that effective range is kept here
 # and written explicitly.
 charcols <- seq_len(ncol(dialzdat) - 1L)
 dialzdat[charcols] <- lapply(dialzdat[charcols], as.character)
 #The End the full data
 # Stack the clinical rows on top of the discretised rows.
 Fullalzdw <- bind_rows(RAWWORD, dialzdat)
 #Create the file
 # Take the last component of the chosen path (split on "\", "|" or "/"),
 # keep only its digits, and wrap them as "GSE<digits>dscrt.txt".
 pathparts <- strsplit(rawdat, "[\\|/]")[[1]]
 lastpart <- pathparts[length(pathparts)]
 nfnaex <- paste(c("GSE", gsub("\\D", "", lastpart), "dscrt.txt"), collapse = "")
 # Write the combined table, tab-separated, with a header and no row names.
 write.table(
   Fullalzdw,
   file = nfnaex,
   sep = "\t",
   col.names = TRUE,
   row.names = FALSE
 )