############# Class Project PGM   GSE31430########################################################

#### Part One Limma

### Set working directory

directory <-(getwd())
directory
##### References:
###https://www.stat.purdue.edu/bigtap/online/docs/Introduction_to_Microarray_Analysis_GSE15947.html
####https://sbc.shef.ac.uk/geo_tutorial/tutorial.nb.html
### Datacamp notes
#####https://www.r-bloggers.com/2012/01/annotating-limma-results-with-gene-names-for-affy-microarrays/
#########################################################################################################################
###libraries needed to run project
library(GEOquery)
library(affy)
library(limma)
library(RColorBrewer)
library(dplyr)
library(ggplot2)
library(annotate)
library(R2HTML)
library (hgu133plus2.db)
library(GO.db)
library(org.Rn.eg.db)
library(Rattus.norvegicus)
library(rat2302.db)



#### Part One General Microarray  DAG

######### Step 1 Download Data

##Get GeoProfiles
 my.gse <- "GSE31430"

##create directory for files downloaded from GEO
if(!file.exists("geo_downloads")) dir.create("geo_downloads")
if(!file.exists("results"))  dir.create("results", recursive=TRUE)
##get data from GEO
my.geo.gse <- getGEO(GEO=my.gse, filename=NULL, destdir="./geo_downloads", GSElimits=NULL, GSEMatrix=TRUE, AnnotGPL=FALSE, getGPL=FALSE)

my.geo.gse
##explore structure of the data downloaded
class(my.geo.gse)

length(my.geo.gse)

names(my.geo.gse)

#get rid of list structure
  
my.geo.gse <- my.geo.gse[[1]]

##object is now an ExpressionSet
class(my.geo.gse)

#### Structure of Data
str(my.geo.gse)

###Column Names
colnames(pData(my.geo.gse))
head(exprs(my.geo.gse))

###summary of data
summary(exprs(my.geo.gse))

rownames(exprs(my.geo.gse))

cd <- (exprs(my.geo.gse))

head (cd)
### Save data

write.csv(cd , file = 'datas-cleaned1.csv')

###################################################################################################################
require("biomaRt")

ensembl <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset("rnorvegicus_gene_ensembl", mart)

annotLookup <- getBM(
  mart=mart,
  attributes=c(
    "ensembl_gene_id",
    "gene_biotype",
    "external_gene_name"),
  filter = 'external_gene_name',
values = rownames(exprs(my.geo.gse)), uniqueRows=TRUE)


annotLookup
head(annotLookup)



listMarts()
ensembl <- useMart("ensembl")
datasets <- listDatasets(ensembl)
ensembl = useDataset("rnorvegicus_gene_ensembl",mart=ensembl)
annot <- getBM(attributes=c('external_gene_name','ensembl_gene_id')
               , mart = ensembl)

########################################################################################
#################################################################################################################################################################################################################################################################################################################################################
# The above sequence gives  metadata, phenodata (sample data), and experimental data (microarray intensities) for this experiment. The expression data is in format that could be used for differential gene expression (DGE) analysis. However, it is generally a good idea to start with the raw data and process it yourself. 
 #use RMA (Robust Multiarray Average) instead. Currently, it is the most common method to determine probeset expression level for Affymetrix arrays. Another potential reason to reprocess published data is so that you can combine data from different projects.
#################################################################################################################################################################################################################################################

###  RMA (Robust Multiarray Average) Method
if(!file.exists(paste0("./geo_downloads/",my.gse)))
  getGEOSuppFiles(my.gse, makeDirectory=T, baseDir="geo_downloads")

##list files in working directory
list.files("geo_downloads")
list.files(paste0("geo_downloads/",my.gse))

##function creates a directory and downloads the "tarball" that contains
##gzipped CEL files. We need to untar the tarball but we do not need to unzip
##the CEL files. R can read them in compressed format.
untar(paste0("geo_downloads/",my.gse,"/",my.gse,"_RAW.tar"), exdir=paste0("geo_downloads/",my.gse,"/CEL"))
list.files(paste0("geo_downloads/",my.gse,"/CEL"))

my.cels <- list.files(paste0("geo_downloads/",my.gse,"/CEL"), pattern=".CEL")
my.cels

my.cels <- sort(my.cels)
my.cels
head(my.cels)
tail(my.cels)
class(my.cels)
### Rename column 4 has error (extra 2)
names(my.cels)[names(my.cels) == 'GSM780659_PF6__Rat230_2__2.CEL.gz'] <- 'GSM780659_PF6__Rat230_2__.CEL.gz'

tail(my.cels)

##### didn't work download file to fix name and re-upload
x <- as.data.frame(my.cels)

### To excel
write.csv(x , file = 'datas-cleaned11.csv')

#### Bring Back into R
my.cels <- datas_cleaned11 
summary(my.cels)
class(my.cels)
my.cels
################################################################################################################################################################################################################################
########### Processing Microarray Data
####################################################################################################################
######Now that we have the CEL files and associated metadata, we can reprocess these microarrays and analyze the results with our preferred methods.
##make data frame of phenoData
my.pdata <- as.data.frame(pData(my.geo.gse), stringsAsFactors=F)
head(my.pdata)
colnames(my.pdata)

my.pdata <- my.pdata[, c("geo_accession","treatment:ch1" )]
my.pdata <- my.pdata[order(rownames(my.pdata)), ]
my.pdata <- my.pdata[, c("title", "geo_accession","treatment:ch1")]
my.pdata <- my.pdata[order(rownames(my.pdata)), ]
head(my.pdata, 10)
my.pdata

##the rownames of the data frame must be the same as the CEL file names.
##For this data, we need to do a bit of tinkering.
rownames(my.pdata) == my.cels
table(rownames(my.pdata) == my.cels)
head(my.cels)
head(rownames(my.pdata))
temp.rownames <- paste(rownames(my.pdata), ".CEL.gz", sep="")
table(temp.rownames == my.cels)
rownames(my.pdata) <- temp.rownames
rm(temp.rownames)
table(rownames(my.pdata) == my.cels)
###### Check to see if everything now matches
my.pdata
######

write.table(my.pdata, file=paste0("geo_downloads/",my.gse,"/CEL/",my.gse,"_SelectPhenoData.txt"), sep="\t", quote=F)
################################################################################################################
## CEL files and a corresponding data frame with the phenoData, we can read the CEL files into R with the ReadAffy function.
###This formats the data as an AffyBatch object that includes the experimental and phenodata.
################################################################################################################################
##Perform affy normalization

cel.path <- paste0("geo_downloads/",my.gse,"/CEL")
my.affy <- ReadAffy(celfile.path=cel.path, phenoData=paste(cel.path, paste0(my.gse,"_SelectPhenoData.txt"), sep="/"))
show(my.affy)

head(exprs(my.affy))
pData(my.affy)

##create shorter descriptive levels and labels
pData(my.affy)$sample.levels <- c(rep("PF", 4), rep("ZD_10_days", 4), rep("ZR_PF", 4), rep("ZR", 4))
pData(my.affy)$sample.labels <- c(paste("PF", 1:4, sep="."), paste("ZD_10_days", 1:4, sep="."), paste("ZR_PF", 1:4, sep="."), paste("ZR", 1:4, sep="."))
table(pData(my.affy)$sample.levels)

table(pData(my.affy)$treatment.ch1, pData(my.affy)$sample.levels)
###################################################################################################################################################
#### Calculate Gene Expression
##Calculate gene level expression measures
my.rma <- rma(my.affy, normalize=F, background=F)
head(exprs(my.rma))
dim(exprs(my.rma))

pData(my.rma)

#### Plot Densities
plotDensity(exprs(my.rma))

##Boxplot is another way of showing the same thing. Add color
level.pal <- brewer.pal(6, "Dark2")
level.pal

boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Not Normalized")
###Save Data
pdf(file="results/DensityNoNorm.pdf", w=6, h=6)
boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Not Normalized")
dev.off()

######Normalize Arrays
##Now use normalization and background correction
my.rma <- rma(my.affy, normalize=T, background=T)
boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Normalized")
### Save Results
pdf(file="results/DensityNorm.pdf", w=6, h=6)
boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Normalized")
dev.off()

##save rma values
write.table(exprs(my.rma), file=paste0("results/",my.gse,"_RMA_Norm.txt"), sep="\t", quote=FALSE)

####Turn Normalized Data into Dataframe
my.data1 <- as.data.frame(pData(my.rma), stringsAsFactors=F)
my.data1
########################################################################################################
####QC
### Investigate
plotMDS(exprs(my.rma), labels=pData(my.rma)$sample.labels, top=500, gene.selection="common", main="MDS Plot to Compare Replicates")
##Save
pdf(file="results/MDS_plot.pdf", w=6, h=6)
plotMDS(exprs(my.rma), labels=pData(my.rma)$sample.labels, top=500, gene.selection="common", main="MDS Plot to Compare Replicates")
dev.off()

#####We can visualize the correlations between the arrays by hierarchical clustering. 
####First, we need to scale the rma values so that highly-expressed gene do not dominate the result. To do this, we are going to produce a matrix of gene-specific z-scores from the rma values. To do this, you must calculate the mean expression and standard deviation for each gene across all samples. The z-score is calculated with the following formula:
###z-score = (rma-sample - rma-mean) / rma-sd
###Thus, the z-score indicates how many standard deviations an observed rma value is above or 
##below the mean.

cluster.dat <- exprs(my.rma)
gene.mean <- apply(cluster.dat, 1, mean)
gene.sd <- apply(cluster.dat, 1, sd)
cluster.dat <- sweep(cluster.dat, 1, gene.mean, "-")
cluster.dat <- sweep(cluster.dat, 1, gene.sd, "/")
my.dist <- dist(t(cluster.dat), method="euclidean")
my.hclust <- hclust(my.dist, method="average")
my.hclust$labels <- pData(my.rma)$sample.labels
plot(my.hclust, cex=0.75, main="Comparison of Biological Replicates", xlab="Euclidean Distance")
###Save
pdf(file="results/Cluster_plot.pdf")
plot(my.hclust, cex=0.75, main="Comparison of Biological Replicates", xlab="Pearson Distance")
dev.off()

##########################################################################################################
###Specify Design
## Create a single variable
   
 group <- with(pData(my.rma),sample.levels, pData(my.rma))
 group <- factor(group)

# Create design matrix with no intercept
my.design <- model.matrix(~0 + group)
colnames(my.design) <- levels (group)
colSums(my.design)

my.design

#### Fitting the Coefficients#####
##determine the average effect (coefficient) for each treatment
my.fit <- lmFit(my.rma, my.design)
my.fit

write.table(my.fit$coefficients, file=paste0("results/",my.gse,"_Limma_Coeff.txt"), sep="\t", quote=F)
 
##specify the contrast of interest using the levels from the design matrix (not significant interaction)
my.contrasts <- makeContrasts(PFvsZD = PF - ZD_10_days, ZRvsZRPF = ZR_PF - ZR,
                              ZR_PFvsPF = ZR_PF - PF, ZRvsZD10days = ZR - ZD_10_days, 
                               levels = my.design)
my.contrasts


###Statistics

# Fit the model from above
my.fit <- lmFit(my.rma, my.design)
my.fit

# Fit the contrasts
fit2 <- contrasts.fit(my.fit, contrasts = my.contrasts)
fit2

# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
fit2

# decideTests will make calls for DEGs by adjusting the p-values and applying a logFC cutoff similar to topTable
# Summarize results (Strictest interpretation)
 results <- decideTests(fit2,adjust.method="BH", p.value=0.05,lfc=1)
 results
 summary(results)
 head(results)
 ?decideTests
 write.csv(results , file = 'datas-cleaned23.csv')
 ######################################################################################################################################

##Plot Results
 # Create a Venn diagram
 vennDiagram(results)
 
 # Obtain the summary statistics for the contrast ZRvsZD10days & ZRPFvsPF
 stats_ZRvsZD10days <- topTable(fit2, coef = "ZRvsZD10days", number = nrow(fit2),                                                          
                            sort.by="B", resort.by="logFC")
 stats_ZRvsZD10days
 
 
 print (stats_ZRvsZD10days)
 
 
 write.csv(stats_ZRvsZD10days , file = 'datas-cleaned24.csv')
 
stats_ZRPFvsPF <- topTable(fit2, coef = "ZR_PFvsPF", number = nrow(fit2),                                                          
                                                   sort.by="B", resort.by="logFC")

print (stats_ZRPFvsPF)

####Create histograms of the p-values for each contrast
 
hist(stats_ZRPFvsPF[, "P.Value"], col=level.pal, main= "Contrast between Pair Feed  vs. Zinc Recovered Pair Feed Rats",
xlab="P-Values")

hist(stats_ZRvsZD10days[, "P.Value"],col=level.pal, main= "Contrast between Zinc Deficient vs. Zinc Recovered Rats", xlab="P-Values")

##### Volcano Plot##############

# Create a volcano plot for the contrast ZR_PFvsPF
volcanoplot(fit2, coef="ZR_PFvsPF",highlight = 10)

####Create a volcano plot for the contrast ZRvsZD10days
 volcanoplot(fit2, coef = "ZRvsZD10days",highlight = 10)
 
###################################################################################################################################
#############################################################################################################################################################
##############################################################################################################################################################
#### BN Learn Bayesian Graph
library(dplyr)
library(ggplot2)
library(bnlearn)
library(gRain)
 
#####################################################################################################################
###Step :3 Discredited Gene Expression & Boot
#####################################################################################################################
 
 ###decideTests will make calls for DEGs by adjusting the p-values and applying a logFC cutoff similar to topTable
 # Summarize results (Strictest interpretation) (data listed as discretized)
 results <- decideTests(fit2,adjust.method="BH", p.value=0.05,lfc=1)
 results
 summary(results)
 head(results)
 ?decideTests
 write.csv(results , file = 'datas-cleaned23.csv')
   
###Bring Back In 
#### Keep top genes that are different
bn5 <- bn2 %>% mutate_if(is.character,as.factor)
class(bn5)
print(bn5)
str(bn5)

###convert character text to factor so it can be read

bn6 <- bn5 %>% mutate_if(is.character,as.factor)
str(bn6)

#####COnvert from tibble to data frame so the program can read it

bn7 <- as.data.frame(bn6)
bn7


######################################################################################################################
#######Step 4: Model Network
######################################################################################################################
### Create  network by listing out your nodes
dag6 <- random.graph(nodes = c("Nupr1l1","Scaf1",
                            "Atp1a3","Cirbp","Gjb2","Pik3ca",
                            "1376534_at","1378107_at","1379132_at",
                            "Dzip3","1380027_at","Fam193a",
                            "1381099_at","Arid3a","Map1a"))

dag7 <- random.graph(nodes = c("Nupr1l1","Scaf1",
                               "Atp1a3","Cirbp","Gjb2","Pik3ca",
                               "1376534_at","1378107_at","1379132_at",
                               "Dzip3","1380027_at","Fam193a",
                               "1381099_at","Arid3a","Map1a"))


################## Plots
dag5 <- model2network("[Dzip3][Nupr1l1|Dzip3][Scaf1|Dzip3][Atp1a3|Dzip3][Cirbp|Dzip3][Gjb2|Dzip3][Pik3ca|Dzip3][1376534_at|Dzip3][1378107_at|Dzip3][1379132_at|Dzip3][1380027_at|Dzip3][Fam193a|Dzip3][1381099_at|Dzip3][Arid3a|Dzip3][Map1a|Dzip3]")

graphviz.plot(dag5, layout = "fdp", shape = "ellipse" )

graphviz.plot(dag6, layout = "fdp", shape = "ellipse" )

graphviz.plot(dag7, layout = "fdp", shape = "ellipse" )

                           


####################################################################################################################################
 ### Step 5: Create call equations
###################################################################################################################################
###Dag5
net1 <-  bn.fit(dag5, data =bn7, method = "bayes", iss=1)
net1
net2 <-  bn.fit(dag5, data =bn7, method = "mle")
net2
net3 <-  bn.fit(dag5, data =bn7, method = "bayes", iss=10)
net3

###Dag6
net6 <-  bn.fit(dag6, data =bn7, method = "bayes", iss=1)
net6
net7 <-  bn.fit(dag6, data =bn7, method = "mle")
net7
net8 <-  bn.fit(dag6, data =bn7, method = "bayes", iss=10)
net8

###Dag7

net6 <-  bn.fit(dag7, data =bn7, method = "bayes", iss=1)
net6
net7 <-  bn.fit(dag7, data =bn7, method = "mle")
net7
net8 <-  bn.fit(dag7, data =bn7, method = "bayes", iss=10)
net8


####################################################################################################################################
#### Step 6 :  Arc. Strength and Network Score
######################################################################################################################################
arc.strength(dag5, data = bn7, criterion = "bic")
arc.strength(dag5, data = bn7, criterion = "bde")
arc.strength(dag5, data = bn7, criterion = "x2")

arc.strength(dag6, data = bn7, criterion = "bic")
arc.strength(dag6, data = bn7, criterion = "bde")
arc.strength(dag6, data = bn7, criterion = "x2")

arc.strength(dag7, data = bn7, criterion = "bic")
arc.strength(dag7, data = bn7, criterion = "bde")
arc.strength(dag7, data = bn7, criterion = "x2")


score(dag5,data= bn7, type="bic")
score(dag6,data= bn7, type="bic")
score(dag7,data= bn7, type="bic")
######################################################################################################################################
### Step 7 :  Extract Exact Inference of the Best Network
#######################################################################################################################################
#### Rename columns in DAG 5 w/ _ so program will run and in data

net1 <-  bn.fit(dag5, data =bn7, method = "bayes", iss=1)
net1

dag10 <- model2network("[Dzip3][Nupr1l1|Dzip3][Scaf1|Dzip3][Atp1a3|Dzip3][Cirbp|Dzip3][Gjb2|Dzip3][Pik3ca|Dzip3][1376534|Dzip3][1378107|Dzip3][1379132|Dzip3][1380027|Dzip3][Fam193a|Dzip3][1381099|Dzip3][Arid3a|Dzip3][Map1a|Dzip3]")

graphviz.plot(dag10, layout = "fdp", shape = "ellipse" )


### Bring back in renamed Columns
bn9 <- bn2 %>% mutate_if(is.character,as.factor)
class(bn9)
print(bn9)
str(bn9)

###convert character text to factor so it can be read

bn10 <- bn9 %>% mutate_if(is.character,as.factor)
str(bn10)

#####Convert from tibble to data frame so the program can read it

bn11 <- as.data.frame(bn10)
bn11

### Create Call Equation
net10 <-  bn.fit(dag10, data = bn11, method = "bayes", iss=1)
net10
         
### Create junction tree
junction<- compile(as.grain(net10))

### Query Network equation
querygrain(junction, nodes = "Dzip3")$Dzip3

### Query Network with known gene related to Mtor
Dzip31<- setEvidence(junction, nodes= "Dzip3", states ="Up")
### Query Network with known gene related to Mtor
Dzip32<- setEvidence(junction, nodes= "Dzip3", states ="Down")
### Pik3ca
querygrain(Dzip31, nodes = "Pik3ca")$Pik3ca
querygrain(Dzip32, nodes = "Pik3ca")$Pik3ca

### Find Marginal, Joint and Conditional Probability & life expectancy
ap1.cpt <- querygrain(Dzip31, nodes= c("Pik3ca","Atp1a3"), type=
                        "marginal")
ap1.cpt

ap2.cpt <- querygrain(Dzip31, nodes= c("Pik3ca","Atp1a3"), type=
                        "joint")
ap2.cpt

ap3.cpt <- querygrain(Dzip31, nodes= c("Pik3ca","Atp1a3"), type=
                        "conditional")
ap3.cpt

######################################################################################################################################
### Step 8 :  Extract Approximate Inference
#######################################################################################################################################
### Forward sampling
### Two genes
set.seed(123)
cpquery(net10, (Dzip3=="Up"), (Pik3ca=="Up"))
cpquery(net10, (Dzip3=="Up"), (Pik3ca=="Notsignficant"))
cpquery(net10, (Dzip3=="Notsignficant"), (Pik3ca=="Up"))


#### Three genes
cpquery(net10, (Dzip3=="Up"), (Pik3ca=="Up" & Atp1a3=="Up"))
cpquery(net10, (Dzip3=="Up"), (Pik3ca=="Notsignficant" & Atp1a3=="Up"))
cpquery(net10, (Dzip3=="Up"), (Pik3ca=="Up" & Atp1a3=="Notsignficant"))


#### Gibbs sampling
### Two Genes
cpquery(net10, method="lw", event = (Dzip3=="Up"), evidence=list(Pik3ca="Up"))
cpquery(net10, method="lw", event = (Dzip3=="Up"), evidence=list(Pik3ca="Notsignficant"))
cpquery(net10, method="lw", event = (Dzip3=="Notsignficant"), evidence=list(Pik3ca="Up"))

### Three Genes
cpquery(net10, method="lw", event = (Dzip3=="Up"), evidence=list(Pik3ca="Up" , Atp1a3="Up"))
cpquery(net10, method="lw", event = (Dzip3=="Up"), evidence=list(Pik3ca="Notsignficant" , Atp1a3="Up"))
cpquery(net10, method="lw", event = (Dzip3=="Up"), evidence=list(Pik3ca="Notsignficant" , Atp1a3="Notsignficant"))


##################################################################################################################################################