# RPostClean.R (3.64 KB)
#For Reading Raw Data from the created file

#Required Libraries
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)


#Necessary Functions

#1# Function for discretizing the data
#
# Maps every non-missing cell of NDATA to one of three levels:
#   0 when the value is below -1,
#   1 when the value lies in [-1, 1],
#   2 when the value is above 1.
# Missing cells are carried over unchanged.
#
# NDATA: a numeric matrix (anything as.matrix() turns into one also works).
# Returns a numeric matrix with the dimensions and column names of NDATA;
# row names are not carried over.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)

  # Start every cell in the middle level so that the boundary values -1 and 1
  # both land there; only cells strictly outside [-1, 1] are moved below.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)

  # which() discards the NA comparisons produced by missing cells
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2

  # copy missing cells through as they are
  missing_cells <- is.na(values)
  DDATA[missing_cells] <- values[missing_cells]

  DDATA
}


#Bringing in the file
# Ask for the tab-delimited input and parse it a single time. The first line
# of the file is skipped and no line is treated as a header, so the columns
# start out as X1, X2, ...
rawdat <- file.choose()
rawtab <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)

# Rows whose first field matches one of these labels are the clinical rows;
# every other row is kept as data.
is_clinical <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", rawtab$X1)

RAWDAT <- rawtab[!is_clinical, ]
# The first remaining row supplies the column names for both tables
# (the row itself is left in RAWDAT).
header <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
names(RAWDAT) <- header

#Just the clinical data
RAWWORD <- rawtab[is_clinical, ]
names(RAWWORD) <- header
#Add col of NAs to clinical data
# ROW_NAs holds, for every clinical row, how many of its fields are missing.
# rowSums() also copes with a table that has no rows at all.
naroww <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)


##Getting back to the data
# Leave out the first row (the one that supplied the column names) and order
# what remains by ID_REF.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# identifiers: the first column, as a matrix
RAWDATID <- as.matrix(RAWDAT2[, 1])
# measurements: every other column run through as.numeric, then transposed
RAWDATNUM <- t(mapply(FUN = as.numeric, RAWDAT2[, -1]))

##Consolidating genes with the same name
# tabRDATID counts how often each identifier occurs. NuRDATN receives one
# column per distinct identifier, in the order of that table.
tabRDATID <- table(RAWDATID)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
for (j in seq_along(tabRDATID)) {
  # columns of RAWDATNUM that carry the j-th identifier
  same_id <- which(RAWDATID == names(tabRDATID)[j])
  if (length(same_id) == 1L) {
    ##Putting the ones without duplicates in their new homes
    NuRDATN[, j] <- RAWDATNUM[, same_id]
  } else if (length(same_id) > 1L) {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row,
    # which rowMeans() requires.
    NuRDATN[, j] <- rowMeans(RAWDATNUM[, same_id, drop = FALSE], na.rm = TRUE)
  }
}


#Scaling the Data
# scale() with its defaults works column by column; the two attributes it
# attaches are removed again so that only the matrix itself is left.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:scale") <- NULL
attr(scrawdat, "scaled:center") <- NULL
# label each column with the identifier it was built from
colnames(scrawdat) <- names(tabRDATID)


#Discretized the Data
# Discretize the scaled matrix, transpose it so that each identifier becomes a
# row, and name the columns after the rows of RAWDATNUM.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# The row names move into a leading ID_REF column and are then cleared.
genena <- data.frame(ID_REF = rownames(dialzdat))
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)

#NAs in a column
# Builds one extra row that is labelled "COL_NAs" in the first column and
# holds, for every other column, the number of missing values in it. The row
# is then appended to dialzdat.
na_per_col <- colSums(is.na(dialzdat))
nacol <- as.data.frame(as.list(unname(na_per_col)), stringsAsFactors = FALSE)
# the first column carries the label instead of a count
nacol[[1]] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

#NAs in a row
# Count the missing entries of every row (all columns are considered) and
# attach the counts as a ROW_NAs column.
narowd <- data.frame(ROW_NAs = as.numeric(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one is converted; the last column is left as it is.
char_cols <- seq_len(ncol(dialzdat) - 1L)
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)


#The End the full data
# clinical rows first, discretized rows underneath
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Output name: "GSE" + the digits found in the chosen file's name + "dscrt.txt".
# basename() removes the directory part using the platform's own path rules,
# so digits that occur in folder names cannot end up in the output name.
series_digits <- gsub("\\D", "", basename(rawdat))
nfnaex <- paste0("GSE", series_digits, "dscrt.txt")
write.table(Fullalzdw, file = nfnaex, sep = "\t", col.names = TRUE, row.names = FALSE)