# RPostClean.R 3.64 KB
# edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158


#For Reading Raw Data from the created file

#Required Libraries
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


#Necessary Functions

#1# Function for discretizing the data
# Maps every entry of a numeric matrix onto three levels:
#   0 for values below -1,
#   1 for values inside the closed interval [-1, 1],
#   2 for values above 1.
# Missing entries (NA/NaN) are passed through unchanged.
#
# NDATA: a numeric matrix (anything as.matrix() turns into a numeric matrix).
# Returns a double matrix with the same dimensions and column names as NDATA.
dndat <- function(NDATA){
	vals <- as.matrix(NDATA)
	# Start from the middle level so that the boundary values -1 and 1 are
	# both classified as 1; only the strict outliers are overwritten below.
	DDATA <- matrix(1, nrow = nrow(vals), ncol = ncol(vals))
	colnames(DDATA) <- colnames(NDATA)
	# which() drops NA comparisons, so missing cells are never used as indices.
	DDATA[which(vals < -1)] <- 0
	DDATA[which(vals > 1)] <- 2
	# Copy missing cells back verbatim (keeps NA as NA and NaN as NaN).
	is_missing <- is.na(vals)
	DDATA[is_missing] <- vals[is_missing]
	DDATA
}


#Bringing in the file
# Ask the user for the tab-delimited input and parse it a single time; the
# first line of the file is skipped and columns get readr's default names
# (X1, X2, ...).
rawdat <- file.choose()
raw_tbl <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)

# Rows whose first column mentions one of these keywords are treated as
# clinical/annotation rows; everything else is kept as data.
is_clinical <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", raw_tbl$X1)

#The data rows; their first row carries the column headers
RAWDAT <- filter(raw_tbl, !is_clinical)
if (nrow(RAWDAT) == 0) {
	stop("No header/data rows found in ", rawdat, call. = FALSE)
}
# Use a plain character vector for the names instead of a one-row tibble.
header_names <- vapply(RAWDAT[1, ], as.character, character(1), USE.NAMES = FALSE)
names(RAWDAT) <- header_names

#Just the clinical data
RAWWORD <- filter(raw_tbl, is_clinical)
names(RAWWORD) <- header_names

#Add col of NAs to clinical data
# rowSums() counts the missing cells of every row at once and also copes
# with the case where no clinical rows were found.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)


##Getting back to the data
# Drop the first row (it was used as the column names) and sort the
# remaining rows by their ID_REF value.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# First column: the identifiers, kept as a one-column matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])
# Remaining columns: convert each one to numeric, then transpose so the
# original columns become the rows of RAWDATNUM.
numeric_columns <- mapply(FUN = as.numeric, RAWDAT2[, -1])
RAWDATNUM <- t(numeric_columns)

##Consolidating genes with the same name
# tabRDATID counts how often every identifier occurs; NuRDATN gets one
# column per distinct identifier, in the order of the table's names.
tabRDATID <- table(RAWDATID)
gene_names <- names(tabRDATID)
# Group the column positions of RAWDATNUM by identifier in a single pass
# instead of rescanning all identifiers for every distinct name.
id_columns <- split(
	seq_along(RAWDATID),
	factor(as.vector(RAWDATID), levels = gene_names)
)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
for (j in seq_along(id_columns)) {
	cols <- id_columns[[j]]
	if (length(cols) == 1) {
		##Putting the ones without duplicates in their new homes
		NuRDATN[, j] <- RAWDATNUM[, cols]
	} else {
		##Averaging duplicates and putting them in their new homes
		# drop = FALSE keeps a matrix even when RAWDATNUM has a single row,
		# which rowMeans() requires.
		NuRDATN[, j] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
	}
}


#Scaling the Data
# Centre and scale every column, then remove the two attributes scale()
# attaches so that only the plain matrix is left.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- names(tabRDATID)


#Discretize the data
# dndat() turns the scaled values into levels; the result is transposed so
# that the identifiers become rows, and the columns are named after the
# rows of RAWDATNUM.
discrete_matrix <- dndat(scrawdat)
dialzdat <- as.data.frame(t(discrete_matrix))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# Move the row names into an explicit first column called ID_REF.
genena <- data.frame(ID_REF = rownames(dialzdat))
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)

#NAs in a column
# Count the missing values of every column except the first (identifier)
# column and append the counts as an extra row labelled "COL_NAs".
col_na_counts <- unname(colSums(is.na(dialzdat[, -1, drop = FALSE])))
nacol <- data.frame("COL_NAs", t(col_na_counts), stringsAsFactors = FALSE)
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

#NAs in a row
# Count the missing values of every row (including the COL_NAs row just
# added) and append the counts as a new column ROW_NAs.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one is converted to character. The previous
# range `2:dim(dialzdat)[2]-1` evaluated as (2:n) - 1, i.e. 1:(n - 1),
# because `:` binds tighter than `-`; the range is now written out
# explicitly and gives the same columns.
char_cols <- seq_len(ncol(dialzdat) - 1L)
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)


#The full data: clinical rows first, discretized rows after
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# The output name is built from the digits found in the last component of
# the chosen file's path: GSE<digits>dscrt.txt. Because only a bare file
# name is passed to write.table(), the file is created relative to the
# current working directory.
path_parts <- strsplit(rawdat, "[\\|/]")[[1]]
source_name <- path_parts[length(path_parts)]
name_digits <- gsub("\\D", "", source_name)
nfnaex <- paste(c("GSE", name_digits, "dscrt.txt"), collapse = "")
write.table(Fullalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)