This code takes the clean data and discretizes it

Efrain Gonzalez
1 parent 69cbaf694d
Showing 1 changed file with 157 additions and 0 deletions Show diff stats
RPostClean.R
@@ -0,0 +1,157 @@
+#For Reading Raw Data from the created file
+
+#Required Libraries
+library(MASS)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(stringr)
+
+
+#Necessary Functions
+
#1# Function for discretizing the data
#
# NDATA: numeric matrix (or data frame of numeric columns).
# Returns a double matrix with the same dimensions and column names as NDATA
# (row names are not carried over) in which each cell is
#   0  if the value is below -1,
#   1  if the value lies in [-1, 1],
#   2  if the value is above 1,
# and missing cells (NA / NaN) are passed through unchanged.
dndat <- function(NDATA) {
  values <- as.matrix(NDATA)
  # Start every cell in the middle bin; only the two tails are overwritten.
  # The earlier cell-by-cell version tested `x < 1` for the middle bin, so a
  # value of exactly 1 matched no branch and silently stayed in the lowest bin.
  DDATA <- matrix(1, nrow = nrow(values), ncol = ncol(values))
  colnames(DDATA) <- colnames(NDATA)
  # which() discards NA comparisons, so missing cells are not touched here.
  DDATA[which(values < -1)] <- 0
  DDATA[which(values > 1)] <- 2
  # Copy missing cells through so NA and NaN stay distinguishable downstream.
  is_missing <- is.na(values)
  DDATA[is_missing] <- values[is_missing]
  DDATA
}
+
+
#Bringing in the file
# The path is picked interactively. The file is parsed as tab-delimited text
# with its first line skipped and without treating any line as a header.
rawdat <- file.choose()
# Parse the file once; the expression table and the clinical table are both
# cut from this single read instead of reading the same file twice.
raw_table <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)
# Rows whose first field mentions one of these labels are clinical
# annotations; every other row is expression data.
is_clinical <- grepl("Group|Age|Region|PMI|Title|Sex|Braak", raw_table$X1)
RAWDAT <- dplyr::filter(raw_table, !is_clinical)

#Just the clinical data
RAWWORD <- dplyr::filter(raw_table, is_clinical)

# The first non-clinical row carries the column labels; use it as the column
# names of both tables (it is removed from the data further down).
header_labels <- as.character(unlist(RAWDAT[1, ], use.names = FALSE))
names(RAWDAT) <- header_labels
names(RAWWORD) <- header_labels

#Add col of NAs to clinical data
# ROW_NAs holds the number of missing cells in each clinical row. rowSums()
# also copes with a file that has no clinical rows, where a 1:0 loop would not.
naroww <- data.frame(ROW_NAs = unname(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)
+
+
##Getting back to the data
# Drop the first row (it was used for the column names) and sort the remaining
# rows by their ID_REF value.
RAWDAT2 <- dplyr::arrange(RAWDAT[-1, ], ID_REF)

##Editing the file for R processing
# RAWDATID: one-column matrix holding the first column (the row identifiers).
RAWDATID <- as.matrix(RAWDAT2[, 1])
# RAWDATNUM: the remaining columns converted with as.numeric() and transposed,
# so rows follow the input columns and columns follow the input rows.
# The matrix is rebuilt with explicit dimensions because the simplified result
# collapses to a plain vector when only one data row is present, which would
# come out of t() with the wrong orientation.
sample_cols <- RAWDAT2[, -1]
RAWDATNUM <- vapply(sample_cols, as.numeric, numeric(nrow(RAWDAT2)))
RAWDATNUM <- matrix(
  RAWDATNUM,
  nrow = nrow(RAWDAT2),
  ncol = ncol(sample_cols),
  dimnames = list(NULL, names(sample_cols))
)
RAWDATNUM <- t(RAWDATNUM)
+
##Consolidating genes with the same name
# tabRDATID counts how often each identifier occurs; NuRDATN gets one column
# per distinct identifier, in the order of the table.
tabRDATID <- table(RAWDATID)
# Column positions of every identifier, collected in one pass instead of
# rescanning all identifiers for each table entry.
gene_columns <- split(
  seq_along(RAWDATID),
  factor(RAWDATID, levels = rownames(tabRDATID))
)
NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(tabRDATID))
for (gene in seq_along(gene_columns)) {
  cols <- gene_columns[[gene]]
  if (length(cols) == 1L) {
    ##Putting the ones without duplicates in their new homes
    # Copied as-is so a missing value stays NA (rowMeans would turn it into NaN).
    NuRDATN[, gene] <- RAWDATNUM[, cols]
  } else {
    ##Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row;
    # without it rowMeans() fails on the resulting plain vector.
    NuRDATN[, gene] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
  }
}
+
+
#Scaling the Data
# Standardize each column of NuRDATN with scale() and label the columns with
# the identifiers from tabRDATID.
scrawdat <- scale(NuRDATN)
colnames(scrawdat) <- rownames(tabRDATID)
# Remove the bookkeeping attributes scale() attaches so that only the plain
# matrix is passed on.
attr(scrawdat, "scaled:scale") <- NULL
attr(scrawdat, "scaled:center") <- NULL
+
+
#Discretized the Data
# dndat() keeps the orientation of scrawdat; the result is transposed, turned
# into a data frame, and its columns are named after the rows of RAWDATNUM.
dialzdat <- as.data.frame(t(dndat(scrawdat)))
colnames(dialzdat) <- rownames(RAWDATNUM)

#gene names
#setting "ID_REF" as a new variable
# The identifiers are moved from the row names into an explicit first column.
# stringsAsFactors is spelled out so the column is character on every R
# version; it is later stacked with character data.
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)
+
#NAs in a column
# Append one summary row to dialzdat: the first cell is the label "COL_NAs"
# and every other cell is the number of missing values in that column.
# colSums() replaces the element-wise loop over 2:ncol, which ran backwards
# (2, 1) and failed when the table had no columns besides the first.
na_per_col <- unname(colSums(is.na(dialzdat)))
nacol <- as.data.frame(t(na_per_col), stringsAsFactors = FALSE)
nacol[[1]] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)
+
#NAs in a row
# ROW_NAs: number of missing cells in each row of dialzdat, counted over all
# of its current columns and rows. rowSums() replaces the row-by-row loop over
# 1:nrow, which misbehaves on an empty table.
narowd <- data.frame(ROW_NAs = unname(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)
+
#converting to character so that the clinical can be brought together with discrete data
# Every column except the last one is converted. The former loop range
# `2:dim(dialzdat)[2]-1` parses as `(2:n) - 1`, i.e. 1:(n - 1); that effective
# range is kept here and written explicitly so the intent is visible.
# List-style assignment is used because `dialzdat[, k]` returns a one-column
# table rather than a vector when dialzdat is a tibble.
char_cols <- seq_len(ncol(dialzdat) - 1L)
dialzdat[char_cols] <- lapply(dialzdat[char_cols], as.character)
+
+
#The End the full data we seem to have found Carmen
# Stack the clinical rows (RAWWORD) on top of the discretized rows (dialzdat).
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

#Create the file
# Output name: "GSE" + every digit in the input file name + "dscrt.txt".
# basename() replaces the hand-written split on "[\\|/]", whose character
# class also split on a literal "|" inside a file name.
# The file is written relative to the current working directory.
nfnaex <- paste0("GSE", gsub("\\D", "", basename(rawdat)), "dscrt.txt")
write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)
+
...	...	@@ -0,0 +1,157 @@
	1	+#For Reading Raw Data from the created file
	2	+
	3	+#Required Libraries
	4	+library(MASS)
	5	+library(dplyr)
	6	+library(tidyr)
	7	+library(readr)
	8	+library(stringr)
	9	+
	10	+
	11	+#Necessary Functions
	12	+
	13	+#1# Function for discretizing the data: fills a matrix shaped like NDATA cell by cell with 0 (value < -1), 2 (value > 1) or 1 (-1 <= value < 1); NA cells are copied through. NOTE(review): a value exactly equal to 1 matches none of the three tests and keeps the initial 0 from matrix(0, ...).
	14	+dndat <- function(NDATA){
	15	+ rownd <- dim(NDATA)[1]
	16	+ colnd <- dim(NDATA)[2]
	17	+ DDATA <- matrix(0,nrow=rownd,ncol=colnd)
	18	+ colnames(DDATA) <- colnames(NDATA)
	19	+ i = 1
	20	+ for(i in 1:rownd){
	21	+ for(j in 1:colnd){
	22	+ if(is.na(NDATA[i,j])==FALSE){
	23	+
	24	+ if(NDATA[i,j] < -1){
	25	+ DDATA[i,j]=0L
	26	+ }
	27	+ if(NDATA[i,j] > 1){
	28	+ DDATA[i,j]=2L
	29	+ }
	30	+ if(-1 <= NDATA[i,j] && NDATA[i,j] < 1){
	31	+ DDATA[i,j]=1L
	32	+ }
	33	+ } else{
	34	+ DDATA[i,j] = NDATA[i,j]
	35	+ }
	36	+ j = j + 1
	37	+ }
	38	+ i = i + 1
	39	+ }
	40	+ DDATA
	41	+}
	42	+
	43	+
	44	+#Bringing in the file: the path is picked interactively, the file is read as tab-delimited text with its first line skipped, and rows whose first field (X1) matches a clinical label are dropped. NOTE(review): "\|" is not a valid escape in an R string literal - the pattern was presumably "Group|Age|..."; confirm against the original file.
	45	+rawdat <- file.choose()
	46	+RAWDAT <- rawdat %>%
	47	+ read_delim(delim ="\t",col_names = FALSE,skip=1) %>%
	48	+ filter(.,!grepl("Group\|Age\|Region\|PMI\|Title\|Sex\|Braak",X1))
	49	+attributes(RAWDAT)$names <- RAWDAT[1,]
	50	+
	51	+#Just the clinical data: the same file is read a second time and only the rows matching the clinical labels are kept; the column names are taken from the first row of RAWDAT
	52	+RAWWORD <- rawdat %>%
	53	+ read_delim(delim ="\t",col_names = FALSE,skip=1) %>%
	54	+ filter(.,grepl("Group\|Age\|Region\|PMI\|Title\|Sex\|Braak",X1))
	55	+attributes(RAWWORD)$names <- RAWDAT[1,]
	56	+#Add col of NAs to clinical data: ROW_NAs holds the number of NA cells in each row of RAWWORD
	57	+z <- 1
	58	+naroww <- as.data.frame(rep(0,dim(RAWWORD)[1]),stringsAsFactors = FALSE)
	59	+for(z in 1:dim(RAWWORD)[1]){
	60	+ naroww[z,1] <- as.integer(sum(is.na(RAWWORD[z,])))
	61	+ z <- z + 1
	62	+}
	63	+colnames(naroww) <- "ROW_NAs"
	64	+RAWWORD <- bind_cols(RAWWORD,naroww)
	65	+
	66	+
	67	+##Getting back to the data: drop the first row (it was used for the column names) and sort the remaining rows by ID_REF
	68	+RAWDAT2 <- RAWDAT[-1,] %>%
	69	+ dplyr::arrange(.,ID_REF)
	70	+
	71	+##Editing the file for R processing: RAWDATID is the first column as a matrix; RAWDATNUM is every other column converted with as.numeric and then transposed
	72	+RAWDATID <- RAWDAT2[,1] %>%
	73	+ as.matrix(.)
	74	+RAWDATNUM <- RAWDAT2[,-1] %>%
	75	+ mapply(.,FUN = as.numeric) %>%
	76	+ t(.)
	77	+
	78	+##Consolidating genes with the same name: NuRDATN gets one column per distinct value of RAWDATID, in the order of table(RAWDATID)
	79	+tabRDATID <- table(RAWDATID)
	80	+NuRDATN <- matrix(0, nrow = dim(RAWDATNUM)[1], ncol = length(tabRDATID))
	81	+j <- 1
	82	+for(j in 1:length(tabRDATID)){
	83	+ ##Putting the ones without duplicates in their new homes (the single matching column is copied unchanged)
	84	+ if(tabRDATID[j] == 1){
	85	+ NuRDATN[,j] <- RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])]
	86	+ }
	87	+ ##Averaging duplicates and putting them in their new homes (row-wise mean over the matching columns, ignoring NAs)
	88	+ if(tabRDATID[j] > 1){
	89	+ NuRDATN[,j] <- rowMeans(RAWDATNUM[,which(RAWDATID==rownames(tabRDATID)[j])],na.rm = TRUE)
	90	+ }
	91	+ j <- j + 1
	92	+}
	93	+
	94	+
	95	+#Scaling the Data: scale() is applied with its default arguments, its "scaled:center" and "scaled:scale" attributes are removed, and the columns are named after the entries of tabRDATID
	96	+scrawdat <- NuRDATN%>%
	97	+ scale()
	98	+attr(scrawdat,"scaled:center") <- NULL
	99	+attr(scrawdat,"scaled:scale") <- NULL
	100	+colnames(scrawdat) <- rownames(tabRDATID)
	101	+
	102	+
	103	+#Discretized the Data: apply dndat(), transpose, convert to a data frame and name the columns after the rows of RAWDATNUM
	104	+dialzdat <- scrawdat %>%
	105	+ dndat(.) %>%
	106	+ t()%>%
	107	+ as.data.frame(.)
	108	+colnames(dialzdat) <- rownames(RAWDATNUM)
	109	+
	110	+#gene names: taken from the row names of dialzdat
	111	+genena <- as.data.frame(as.matrix(rownames(dialzdat),ncol=1))
	112	+#setting "ID_REF" as a new variable, then clearing the row names and putting the ID_REF column in front of the data
	113	+colnames(genena) <- "ID_REF"
	114	+rownames(dialzdat) <- NULL
	115	+dialzdat <-bind_cols(genena,dialzdat)
	116	+
	117	+#NAs in a column: builds a one-row data frame whose first cell is "COL_NAs" and whose cells 2..n hold the NA count of the matching column of dialzdat, then appends it as a new last row
	118	+x <- 2
	119	+nacol <- as.data.frame(t(rep(0,dim(dialzdat)[2])),stringsAsFactors = FALSE)
	120	+nacol[1,1] = "COL_NAs"
	121	+for(x in 2:dim(dialzdat)[2]){
	122	+ nacol[1,x] <- as.integer(sum(is.na(dialzdat[,x])))
	123	+ x <- x + 1
	124	+}
	125	+colnames(nacol) <- colnames(dialzdat)
	126	+dialzdat<-bind_rows(dialzdat,nacol)
	127	+
	128	+#NAs in a row: ROW_NAs holds the NA count of each row of dialzdat (the COL_NAs row appended above is counted as well) and is added as a new last column
	129	+y <- 1
	130	+narowd <- as.data.frame(rep(0,dim(dialzdat)[1]),stringsAsFactors = FALSE)
	131	+for(y in 1:dim(dialzdat)[1]){
	132	+ narowd[y,1] <- as.integer(sum(is.na(dialzdat[y,])))
	133	+ y <- y + 1
	134	+}
	135	+colnames(narowd) <- "ROW_NAs"
	136	+dialzdat <- bind_cols(dialzdat,narowd)
	137	+
	138	+#converting to character so that the clinical can be brought together with discrete data. NOTE(review): 2:dim(dialzdat)[2]-1 evaluates as (2:n)-1, so columns 1 to n-1 are converted, not 2 to n-1.
	139	+k <- 2
	140	+for(k in 2:dim(dialzdat)[2]-1){
	141	+ dialzdat[,k] <- as.character(dialzdat[,k])
	142	+ k <- k + 1
	143	+}
	144	+
	145	+
	146	+#The End the full data we seem to have found Carmen: the clinical rows (RAWWORD) are stacked on top of the discretized rows (dialzdat)
	147	+Fullalzdw <- bind_rows(RAWWORD,dialzdat)
	148	+
	149	+#Create the file: the name is "GSE" + all digits of the last path component of rawdat + "dscrt.txt", written tab-separated without row names. NOTE(review): "[\\\|/]" contains "\|", which is not a valid R string escape - presumably "[\\|/]"; confirm against the original file.
	150	+nfnaex <- strsplit(rawdat,"[\\\|/]") %>%
	151	+ .[[1]] %>%
	152	+ .[length(.)] %>%
	153	+ gsub("\\D","",.) %>%
	154	+ c("GSE",.,"dscrt.txt") %>%
	155	+ paste(collapse = "")
	156	+write.table(Fullalzdw, file = nfnaex, sep = "\t",col.names = TRUE,row.names = FALSE)
	157	+