Efrain Gonzalez / Cleaning and Fixing Data with R

Blame view

RPostClean.R 3.64 KB

# For reading raw data from the created file

# Required libraries
library(MASS)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)

# Necessary functions

# 1 # Function for discretizing the data
#
# Maps every cell of a numeric matrix (or all-numeric data frame) onto three
# levels:
#   value < -1        -> 0
#   -1 <= value <= 1  -> 1
#   value > 1         -> 2
# Missing cells stay missing. The result is a double matrix with the same
# dimensions and column names as the input (row names are not carried over).
#
# NDATA: numeric matrix or data frame to discretize.
# Returns: numeric matrix of 0 / 1 / 2 / NA.
dndat <- function(NDATA) {
  vals <- as.matrix(NDATA)
  DDATA <- matrix(NA_real_, nrow = nrow(vals), ncol = ncol(vals))
  colnames(DDATA) <- colnames(NDATA)

  present <- !is.na(vals)
  # Copy the missing cells over unchanged so NA and NaN are both preserved.
  DDATA[!present] <- vals[!present]

  DDATA[present & vals < -1] <- 0
  # Both bounds are inclusive: a value of exactly 1 belongs to the middle
  # level instead of silently keeping the initial 0.
  DDATA[present & vals >= -1 & vals <= 1] <- 1
  DDATA[present & vals > 1] <- 2
  DDATA
}

# Bringing in the file
rawdat <- file.choose()

# Rows whose first field matches one of these labels are treated as clinical
# annotation; every other row is kept as data.
clinical_pattern <- "Group|Age|Region|PMI|Title|Sex|Braak"

# Read the tab-delimited file once (first line skipped, no header parsing) and
# split it into the two row sets below.
raw_all <- read_delim(rawdat, delim = "\t", col_names = FALSE, skip = 1)

RAWDAT <- raw_all %>%
  filter(!grepl(clinical_pattern, X1))

# The first remaining row holds the column names; it is dropped from the data
# further down (RAWDAT[-1, ]).
header_names <- unname(vapply(RAWDAT[1, ], as.character, character(1)))
names(RAWDAT) <- header_names
if (!"ID_REF" %in% names(RAWDAT)) {
  stop("Expected a header row containing 'ID_REF' after the clinical rows.",
       call. = FALSE)
}

# Just the clinical data
RAWWORD <- raw_all %>%
  filter(grepl(clinical_pattern, X1))
names(RAWWORD) <- header_names

# Add col of NAs to clinical data: number of missing fields in each row
naroww <- data.frame(ROW_NAs = as.integer(rowSums(is.na(RAWWORD))))
RAWWORD <- bind_cols(RAWWORD, naroww)

## Getting back to the data
RAWDAT2 <- RAWDAT[-1, ] %>%
  dplyr::arrange(ID_REF)

## Editing the file for R processing
# Identifier column as a one-column character matrix.
RAWDATID <- as.matrix(RAWDAT2[, 1])

# Remaining columns converted to numeric and stacked as rows, so RAWDATNUM has
# one row per original column (row names = column names) and one column per
# data row. rbind keeps that orientation even when there is a single data row.
RAWDATNUM <- do.call(rbind, lapply(RAWDAT2[, -1], as.numeric))
## Consolidating genes with the same name
# RAWDATNUM has one column per entry of RAWDATID; columns sharing an ID are
# merged into a single column of NuRDATN.
tabRDATID <- table(RAWDATID)
gene_ids <- names(tabRDATID)

# Column indices of RAWDATNUM grouped by ID, in the same order as the table.
cols_by_gene <- split(
  seq_along(RAWDATID),
  factor(as.vector(RAWDATID), levels = gene_ids)
)

NuRDATN <- matrix(0, nrow = nrow(RAWDATNUM), ncol = length(gene_ids))
for (g in seq_along(gene_ids)) {
  cols <- cols_by_gene[[g]]
  if (length(cols) == 1L) {
    ## Putting the ones without duplicates in their new homes
    NuRDATN[, g] <- RAWDATNUM[, cols]
  } else {
    ## Averaging duplicates and putting them in their new homes
    # drop = FALSE keeps a matrix even when RAWDATNUM has a single row.
    NuRDATN[, g] <- rowMeans(RAWDATNUM[, cols, drop = FALSE], na.rm = TRUE)
  }
}

# Scaling the data (column-wise centre and scale); the bookkeeping attributes
# added by scale() are dropped.
scrawdat <- scale(NuRDATN)
attr(scrawdat, "scaled:center") <- NULL
attr(scrawdat, "scaled:scale") <- NULL
colnames(scrawdat) <- gene_ids

# Discretized the data, then transposed so that each ID becomes a row
dialzdat <- scrawdat %>%
  dndat() %>%
  t() %>%
  as.data.frame()
colnames(dialzdat) <- rownames(RAWDATNUM)

# Gene names: setting "ID_REF" as a new first variable
genena <- data.frame(ID_REF = rownames(dialzdat), stringsAsFactors = FALSE)
rownames(dialzdat) <- NULL
dialzdat <- bind_cols(genena, dialzdat)

# NAs in a column: one extra row labelled "COL_NAs" holding, for every column
# after the first, the number of missing values in that column.
nacol <- as.data.frame(
  t(c(0, colSums(is.na(dialzdat[-1])))),
  stringsAsFactors = FALSE
)
nacol[1, 1] <- "COL_NAs"
colnames(nacol) <- colnames(dialzdat)
dialzdat <- bind_rows(dialzdat, nacol)

# NAs in a row: one extra column with the number of missing values per row
narowd <- data.frame(ROW_NAs = as.integer(rowSums(is.na(dialzdat))))
dialzdat <- bind_cols(dialzdat, narowd)

# Converting to character so that the clinical can be brought together with
# discrete data. Every column except the last one (ROW_NAs) is converted; this
# is the range the previous `2:dim(dialzdat)[2]-1` actually evaluated to.
value_cols <- seq_len(ncol(dialzdat) - 1L)
dialzdat[value_cols] <- lapply(dialzdat[value_cols], as.character)
# The End: the full data (clinical rows first, then the discretized rows)
Fullalzdw <- bind_rows(RAWWORD, dialzdat)

# Create the file
# The output name is "GSE" + every digit found in the input file's name +
# "dscrt.txt"; it is written relative to the current working directory.
# The input path is split on backslash, pipe or forward slash and the last
# piece is taken as the file name.
path_parts <- strsplit(rawdat, "[\\\\|/]")[[1]]
file_digits <- gsub("\\D", "", path_parts[length(path_parts)])
nfnaex <- paste0("GSE", file_digits, "dscrt.txt")

write.table(
  Fullalzdw,
  file = nfnaex,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)