############# Class Project PGM   GSE31430########################################################

#### Part One Limma

### Set working directory

directory <-(getwd())
directory
##### References:
###https://www.stat.purdue.edu/bigtap/online/docs/Introduction_to_Microarray_Analysis_GSE15947.html
####https://sbc.shef.ac.uk/geo_tutorial/tutorial.nb.html
### Datacamp notes
#####https://www.r-bloggers.com/2012/01/annotating-limma-results-with-gene-names-for-affy-microarrays/
#########################################################################################################################
###libraries needed to run project
library(GEOquery)
library(affy)
library(limma)
library(RColorBrewer)
library(dplyr)
library(ggplot2)
library(annotate)
library(R2HTML)
library (hgu133plus2.db)
library(GO.db)
library(org.Rn.eg.db)
library(Rattus.norvegicus)
library(rat2302.db)



#### Part One General Microarray  DAG

######### Step 1 Download Data

##Get GeoProfiles
 my.gse <- "GSE31430"

##create directory for files downloaded from GEO
if(!file.exists("geo_downloads")) dir.create("geo_downloads")
if(!file.exists("results"))  dir.create("results", recursive=TRUE)
##get data from GEO
my.geo.gse <- getGEO(GEO=my.gse, filename=NULL, destdir="./geo_downloads", GSElimits=NULL, GSEMatrix=TRUE, AnnotGPL=FALSE, getGPL=FALSE)

my.geo.gse
##explore structure of the data downloaded
class(my.geo.gse)

length(my.geo.gse)

names(my.geo.gse)

#get rid of list structure
  
my.geo.gse <- my.geo.gse[[1]]

##object is now an ExpressionSet
class(my.geo.gse)

#### Structure of Data
str(my.geo.gse)

###Column Names
colnames(pData(my.geo.gse))
head(exprs(my.geo.gse))

###summary of data
summary(exprs(my.geo.gse))

rownames(exprs(my.geo.gse))

cd <- (exprs(my.geo.gse))

head (cd)
### Save data

write.csv(cd , file = 'datas-cleaned1.csv')

###################################################################################################################
require("biomaRt")

ensembl <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset("rnorvegicus_gene_ensembl", mart)

annotLookup <- getBM(
  mart=mart,
  attributes=c(
    "ensembl_gene_id",
    "gene_biotype",
    "external_gene_name"),
  filter = 'external_gene_name',
values = rownames(exprs(my.geo.gse)), uniqueRows=TRUE)


annotLookup
head(annotLookup)



listMarts()
ensembl <- useMart("ensembl")
datasets <- listDatasets(ensembl)
ensembl = useDataset("rnorvegicus_gene_ensembl",mart=ensembl)
annot <- getBM(attributes=c('external_gene_name','ensembl_gene_id')
               , mart = ensembl)

########################################################################################
#################################################################################################################################################################################################################################################################################################################################################
# The above sequence gives  metadata, phenodata (sample data), and experimental data (microarray intensities) for this experiment. The expression data is in format that could be used for differential gene expression (DGE) analysis. However, it is generally a good idea to start with the raw data and process it yourself. 
 #use RMA (Robust Multiarray Average) instead. Currently, it is the most common method to determine probeset expression level for Affymetrix arrays. Another potential reason to reprocess published data is so that you can combine data from different projects.
#################################################################################################################################################################################################################################################

###  RMA (Robust Multiarray Average) Method
if(!file.exists(paste0("./geo_downloads/",my.gse)))
  getGEOSuppFiles(my.gse, makeDirectory=T, baseDir="geo_downloads")

##list files in working directory
list.files("geo_downloads")
list.files(paste0("geo_downloads/",my.gse))

##function creates a directory and downloads the "tarball" that contains
##gzipped CEL files. We need to untar the tarball but we do not need to unzip
##the CEL files. R can read them in compressed format.
untar(paste0("geo_downloads/",my.gse,"/",my.gse,"_RAW.tar"), exdir=paste0("geo_downloads/",my.gse,"/CEL"))
list.files(paste0("geo_downloads/",my.gse,"/CEL"))

my.cels <- list.files(paste0("geo_downloads/",my.gse,"/CEL"), pattern=".CEL")
my.cels

my.cels <- sort(my.cels)
my.cels
head(my.cels)
tail(my.cels)
class(my.cels)
### Rename column 4 has error (extra 2)
names(my.cels)[names(my.cels) == 'GSM780659_PF6__Rat230_2__2.CEL.gz'] <- 'GSM780659_PF6__Rat230_2__.CEL.gz'

tail(my.cels)

##### didn't work download file to fix name and re-upload
x <- as.data.frame(my.cels)

### To excel
write.csv(x , file = 'datas-cleaned11.csv')

#### Bring Back into R
my.cels <- datas_cleaned11 
summary(my.cels)
class(my.cels)
my.cels
################################################################################################################################################################################################################################
########### Processing Microarray Data
####################################################################################################################
######Now that we have the CEL files and associated metadata, we can reprocess these microarrays and analyze the results with our preferred methods.
##make data frame of phenoData
my.pdata <- as.data.frame(pData(my.geo.gse), stringsAsFactors=F)
head(my.pdata)
colnames(my.pdata)

my.pdata <- my.pdata[, c("geo_accession","treatment:ch1" )]
my.pdata <- my.pdata[order(rownames(my.pdata)), ]
my.pdata <- my.pdata[, c("title", "geo_accession","treatment:ch1")]
my.pdata <- my.pdata[order(rownames(my.pdata)), ]
head(my.pdata, 10)
my.pdata

##the rownames of the data frame must be the same as the CEL file names.
##For this data, we need to do a bit of tinkering.
rownames(my.pdata) == my.cels
table(rownames(my.pdata) == my.cels)
head(my.cels)
head(rownames(my.pdata))
temp.rownames <- paste(rownames(my.pdata), ".CEL.gz", sep="")
table(temp.rownames == my.cels)
rownames(my.pdata) <- temp.rownames
rm(temp.rownames)
table(rownames(my.pdata) == my.cels)
###### Check to see if everything now matches
my.pdata
######

write.table(my.pdata, file=paste0("geo_downloads/",my.gse,"/CEL/",my.gse,"_SelectPhenoData.txt"), sep="\t", quote=F)
################################################################################################################
## CEL files and a corresponding data frame with the phenoData, we can read the CEL files into R with the ReadAffy function.
###This formats the data as an AffyBatch object that includes the experimental and phenodata.
################################################################################################################################
##Perform affy normalization

cel.path <- paste0("geo_downloads/",my.gse,"/CEL")
my.affy <- ReadAffy(celfile.path=cel.path, phenoData=paste(cel.path, paste0(my.gse,"_SelectPhenoData.txt"), sep="/"))
show(my.affy)

head(exprs(my.affy))
pData(my.affy)

##create shorter descriptive levels and labels
pData(my.affy)$sample.levels <- c(rep("PF", 4), rep("ZD_10_days", 4), rep("ZR_PF", 4), rep("ZR", 4))
pData(my.affy)$sample.labels <- c(paste("PF", 1:4, sep="."), paste("ZD_10_days", 1:4, sep="."), paste("ZR_PF", 1:4, sep="."), paste("ZR", 1:4, sep="."))
table(pData(my.affy)$sample.levels)

table(pData(my.affy)$treatment.ch1, pData(my.affy)$sample.levels)
###################################################################################################################################################
#### Calculate Gene Expression
##Calculate gene level expression measures
my.rma <- rma(my.affy, normalize=F, background=F)
head(exprs(my.rma))
dim(exprs(my.rma))

pData(my.rma)

#### Plot Densities
plotDensity(exprs(my.rma))

##Boxplot is another way of showing the same thing. Add color
level.pal <- brewer.pal(6, "Dark2")
level.pal

boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Not Normalized")
###Save Data
pdf(file="results/DensityNoNorm.pdf", w=6, h=6)
boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Not Normalized")
dev.off()

######Normalize Arrays
##Now use normalization and background correction
my.rma <- rma(my.affy, normalize=T, background=T)
boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Normalized")
### Save Results
pdf(file="results/DensityNorm.pdf", w=6, h=6)
boxplot(exprs(my.rma), las=2, names=pData(my.rma)$sample.labels, outline=F, col=level.pal, main="Arrays Normalized")
dev.off()

##save rma values
write.table(exprs(my.rma), file=paste0("results/",my.gse,"_RMA_Norm.txt"), sep="\t", quote=FALSE)

####Turn Normalized Data into Dataframe
my.data1 <- as.data.frame(pData(my.rma), stringsAsFactors=F)
my.data1
########################################################################################################
####QC
### Investigate
plotMDS(exprs(my.rma), labels=pData(my.rma)$sample.labels, top=500, gene.selection="common", main="MDS Plot to Compare Replicates")
##Save
pdf(file="results/MDS_plot.pdf", w=6, h=6)
plotMDS(exprs(my.rma), labels=pData(my.rma)$sample.labels, top=500, gene.selection="common", main="MDS Plot to Compare Replicates")
dev.off()

#####We can visualize the correlations between the arrays by hierarchical clustering. 
####First, we need to scale the rma values so that highly-expressed gene do not dominate the result. To do this, we are going to produce a matrix of gene-specific z-scores from the rma values. To do this, you must calculate the mean expression and standard deviation for each gene across all samples. The z-score is calculated with the following formula:
###z-score = (rma-sample - rma-mean) / rma-sd
###Thus, the z-score indicates how many standard deviations an observed rma value is above or 
##below the mean.

cluster.dat <- exprs(my.rma)
gene.mean <- apply(cluster.dat, 1, mean)
gene.sd <- apply(cluster.dat, 1, sd)
cluster.dat <- sweep(cluster.dat, 1, gene.mean, "-")
cluster.dat <- sweep(cluster.dat, 1, gene.sd, "/")
my.dist <- dist(t(cluster.dat), method="euclidean")
my.hclust <- hclust(my.dist, method="average")
my.hclust$labels <- pData(my.rma)$sample.labels
plot(my.hclust, cex=0.75, main="Comparison of Biological Replicates", xlab="Euclidean Distance")
###Save
pdf(file="results/Cluster_plot.pdf")
plot(my.hclust, cex=0.75, main="Comparison of Biological Replicates", xlab="Pearson Distance")
dev.off()

##########################################################################################################
###Specify Design
## Create a single variable
   
 group <- with(pData(my.rma),sample.levels, pData(my.rma))
 group <- factor(group)

# Create design matrix with no intercept
my.design <- model.matrix(~0 + group)
colnames(my.design) <- levels (group)
colSums(my.design)

my.design

#### Fitting the Coefficients#####
##determine the average effect (coefficient) for each treatment
my.fit <- lmFit(my.rma, my.design)
my.fit

write.table(my.fit$coefficients, file=paste0("results/",my.gse,"_Limma_Coeff.txt"), sep="\t", quote=F)
 
##specify the contrast of interest using the levels from the design matrix (not significant interaction)
my.contrasts <- makeContrasts(PFvsZD = PF - ZD_10_days, ZRvsZRPF = ZR_PF - ZR,
                              ZR_PFvsPF = ZR_PF - PF, ZRvsZD10days = ZR - ZD_10_days, 
                               levels = my.design)
my.contrasts


###Statistics

# Fit the model from above
my.fit <- lmFit(my.rma, my.design)
my.fit

# Fit the contrasts
fit2 <- contrasts.fit(my.fit, contrasts = my.contrasts)
fit2

# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
fit2

# decideTests will make calls for DEGs by adjusting the p-values and applying a logFC cutoff similar to topTable
# Summarize results (Strictest interpretation)
 results <- decideTests(fit2,adjust.method="BH", p.value=0.05,lfc=1)
 results
 summary(results)
 head(results)
 ?decideTests
 write.csv(results , file = 'datas-cleaned23.csv')
 ######################################################################################################################################

##Plot Results
 # Create a Venn diagram
 vennDiagram(results)
 
 # Obtain the summary statistics for the contrast ZRvsZD10days & ZRPFvsPF
 stats_ZRvsZD10days <- topTable(fit2, coef = "ZRvsZD10days", number = nrow(fit2),                                                          
                            sort.by="B", resort.by="logFC")
 stats_ZRvsZD10days
 
 
 print (stats_ZRvsZD10days)
 
 
 write.csv(stats_ZRvsZD10days , file = 'datas-cleaned24.csv')
 
stats_ZRPFvsPF <- topTable(fit2, coef = "ZR_PFvsPF", number = nrow(fit2),                                                          
                                                   sort.by="B", resort.by="logFC")

print (stats_ZRPFvsPF)

####Create histograms of the p-values for each contrast
 
hist(stats_ZRPFvsPF[, "P.Value"], col=level.pal, main= "Contrast between Pair Feed  vs. Zinc Recovered Pair Feed Rats",
xlab="P-Values")

hist(stats_ZRvsZD10days[, "P.Value"],col=level.pal, main= "Contrast between Zinc Deficient vs. Zinc Recovered Rats", xlab="P-Values")

##### Volcano Plot##############

# Create a volcano plot for the contrast ZR_PFvsPF
volcanoplot(fit2, coef="ZR_PFvsPF",highlight = 10)

####Create a volcano plot for the contrast ZRvsZD10days
 volcanoplot(fit2, coef = "ZRvsZD10days",highlight = 10)
 
###################################################################################################################################
#############################################################################################################################################################
##############################################################################################################################################################
#### BN Learn Bayesian Graph
library(dplyr)
library(ggplot2)
library(bnlearn)
library(gRain)
 
#####################################################################################################################
###Step :3 Discredited Gene Expression & Boot
#####################################################################################################################
 
 ###decideTests will make calls for DEGs by adjusting the p-values and applying a logFC cutoff similar to topTable
 # Summarize results (Strictest interpretation) (data listed as discretized)
 results <- decideTests(fit2,adjust.method="BH", p.value=0.05,lfc=1)
 results
 summary(results)
 head(results)
 ?decideTests
 write.csv(results , file = 'datas-cleaned23.csv')
   
###Bring Back In 
#### Keep top genes that are different
bn5 <- bn2 %>% mutate_if(is.character,as.factor)
class(bn5)
print(bn5)
str(bn5)


####################################################################################################################
#######Step 4: Model Network
######################################################################################################################


### Create  network by listing out your nodes
dag <- empty.graph(nodes = c("Nupr1l1","Scaf1",
                            "Atp1a3","Cirbp","Gjb2","Pik3ca",
                            "1376534_at","1378107_at","1379132_at",
                            "Dzip3","1380027_at","Fam193a",
                            "1381099_at","Arid3a","Map1a"))



#####  Set DAG arcs ( this gives you the direction of your network)
arc.set1 <- matrix(c("Dzip3","Nupr1l1",
                     "Dzip3","Scaf1",
                     "Dzip3" ,"Atp1a3",
                     "Dzip3", "Cirbp",
                     "Dzip3","Gjb2",  
                     "Dzip3", "Pik3ca",   
                     "Dzip3","1376534_at",   
                     "Dzip3","1378107_at", 
                     "Dzip3","1379132_at",
                     "Dzip3","1380027_at",
                     "Dzip3", "Fam193a",
                     "Dzip3", "1381099_at",
                     "Dzip3", "Arid3a",
                     "Dzip3", "Map1a"),
                   byrow = TRUE, ncol = 2,
                   dimnames = list(NULL, c("from", "to")))

arcs(dag) <- arc.set1
nodes(dag)
dag                

################## Plots
graphviz.plot(dag, layout = "fdp", shape = "ellipse" )