# 9-3-2020 JHZ

UniProt <- function()
{
  library(UniProt.ws)
  x <- UniProt.ws(taxId=9606)
  columns(x)
  keytypes(x)
  keys <- with(hgTables,entrezGene)
  columns <- c("PDB","GENES","HGNC","PROTEIN-NAMES","UNIPROTKB")
  keytypes <- c("ENTREZ_GENE")
  UniProt.info <- select(x,keys,columns,keytypes)
  save(UniProt.info,file="UniProt.info.rda")
  library(org.Hs.eg.db)
  columns(org.Hs.eg.db)
  s <- select(org.Hs.eg.db,keys,c("UNIPROT","ENSEMBL","ENSEMBLPROT","SYMBOL","ENSEMBLTRANS"))
  require(hgu95av2.db)
  x <- hgu95av2UNIPROT
  mapped_genes <- mappedkeys(x)
  xx <- as.list(x[mapped_genes])
  if(length(xx) > 0) xx[1:5]
}

swath_ms_data <- function()
{
  swath_pheno <- read.csv("INTERVALdata_13FEB2020.csv",as.is=TRUE)
  idmap <- read.csv("omicsMap.csv",as.is=TRUE)
  vars <- c("Affymetrix_gwasQC_bl","caprion_id","swathMS_id","sexPulse","agePulse","ht_bl","wt_bl","CRP_bl","FERR_bl","TRANSFS_bl","TRANSF_bl","HEP_bl")
  swath_pheno <- within(merge(idmap,swath_pheno,by="identifier")[vars],{sex=sexPulse;age=agePulse;bmi=wt_bl/ht_bl/ht_bl})
# UCSC Uniprot IDs
  hgTables <- read.delim("hgTables.gz",quote="")
  load(paste(Sys.getenv("INF"),"Caprion","caprion.rda",sep="/"))
  m <- merge(Protein_All_Peptides,protein_list,by="Protein")
  id1 <- with(m,Accession)
  library(openxlsx)
  rds.xlsx <- "DIINT01_BATCH_1_MS_BATCH_INPUT_RDS_02DEC19 final.xlsx"
  protein_matrix.file <- "DIINT_protein_matrix final.xlsx"
  rds <- read.xlsx(rds.xlsx, sheet = 1, startRow = 1)
  protein_matrix <- read.xlsx(protein_matrix.file, sheet = 1, startRow = 1)
  id <- protein_matrix[-1,"Internal.ID"]
  o <-id[!id%in%with(hgTables,acc)]
  f <- function(i) {index <- grep(o[i],hgTables$accList); cat(index,o[i],"\n"); as.character(hgTables$accList[index])}
  for(i in 1:6) print(f(i))
# P01621, P01771, P01776, P01781, P08107, P62158
# P01619, P01772, P01764, P01780, P08107, P62158
# P08107 and P62158 were obsolete on May 27 and 10, 2015, respectively
  r <- c("P01619", "P01772", "P01764", "P01780", "P08107", "P62158")
  protein_matrix <- within(protein_matrix,{Internal.ID[Internal.ID%in%o] <- r})
  p <- t(protein_matrix[-1,-1])
  colnames(p) <- protein_matrix[-1,"Internal.ID"]
  rownames(p) <- protein_matrix[1,-1]
  head(p[,colnames(p)%in%o])
  swath_protein <- data.frame(Internal.ID=colnames(protein_matrix)[-1],External.ID=rownames(p), p)
  swath_protein <-merge(swath_pheno[c("Affymetrix_gwasQC_bl","swathMS_id")],swath_protein[,-1],by.x="swathMS_id",by.y="External.ID")
  ord <- with(swath_protein,order(Affymetrix_gwasQC_bl))
  swath_protein <- swath_protein[ord,]
  hgTables_vars <- c("acc","accList","uniprotName","ensGene","geneName")
# under_score <- with(hgTables,grep("_",X.chrom))
# selected <- setdiff(1:nrow(hgTables),under_score)
# m <- merge(protein_matrix[-1,],hgTables[selected,c("X.chrom","chromStart","chromEnd","name")],by.x="Internal.ID",by.y="name")
  annotated_protein_matrix <- merge(data.frame(Accession=protein_matrix[-1,1]),hgTables[hgTables_vars],by.x="Accession",by.y="acc")
# biomart annotation
  library(biomaRt)
  ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="grch37.ensembl.org", path="/biomart/martservice")
  attr <- listAttributes(ensembl)
  g <- c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'description', 'hgnc_symbol')
  t <- c('ensembl_transcript_id', 'transcription_start_site', 'transcript_start', 'transcript_end')
  gtu <- getBM(attributes = c(g,t,"uniprotswissprot"), mart = ensembl)
  under_score <- with(gtu,grep("_",chromosome_name))
  selected <- setdiff(1:nrow(gtu),under_score)
  p <- swath_protein[,-(1:2)]
  rownames(p) <- swath_protein[,1]
  tp <- data.frame(uniprot=colnames(p),t(p))
  bed <- merge(gtu[selected,c("chromosome_name","start_position","end_position","uniprotswissprot","ensembl_gene_id")],tp,
               by.y="uniprot",by.x="uniprotswissprot")
  header <- names(bed)[-5]
  header[1:4] <- c("#chr","start","end","gene_id")
  cat(header,file="swath-ms.bed",sep="\t")
  cat("\n",file="swath-ms.bed",append=TRUE)
  bed <- within(bed,{chr <- as.numeric(chromosome_name)})
  ord <- with(bed,order(chr,start_position,end_position))
  reset <- with(bed,is.na(chromosome_name))
  bed[reset,5] <- bed[reset, 1]
  bed[reset,2] <- 99+1:nrow(bed[reset,])
  bed[reset,3] <- 1
  bed[reset,4] <- 2
  write.table(bed[ord,-c(1,ncol(bed))],file="swath-ms.bed",append=TRUE,col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
  save(hgTables,annotated_protein_matrix,protein_matrix,swath_protein,swath_pheno,bed,rds,file="swath-ms.rda")
}
# awk 'a[$0]++<1' swath-ms.bed | awk -vOFS="\t" '{if(NR==1) gsub(/X/,"",$0); else $1="chr" $1};1' | gzip -f > swath-ms.expression.bed.gz

plotfun <- function(col)
{
  d <- df[,col]
  xlab <- "Individual"
  ylab <- colnames(df)[col]
  plot(d, xlab=xlab, ylab = ylab, main=ylab, type = "p", cex=0.6)
  hist(d, xlab=ylab, main="")
  boxplot(d, horizontal=TRUE, cex=0.6)
}

regfun <- function(col)
{
  p <- df[,col]
  l <- lm(p~sex,data=pheno_protein)
  s <- summary(l)
  c <- with(s,coefficients)
  r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t")
  cat(r,"\n",append=TRUE, file="sex.tsv", sep="")
  l <- lm(p~age+sex+bmi,data=pheno_protein)
  s <- summary(l)
  c <- with(s,coefficients)
  r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t")
  cat(r,"\n",append=TRUE, file="lm.tsv", sep="")
}

minmax <- function(x) (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))

ae_swath <- function(x,hidden.layers=c(987,197,987))
{
  library(ggplot2)
  library(plotly)
  library(keras)
  x_train <- apply(x, 2, minmax)
  x_train <- as.matrix(x_train)
  model <- keras_model_sequential()
  model %>%
    layer_dense(units = hidden.layers[1], activation = "tanh", input_shape = ncol(x_train)) %>%
    layer_dense(units = hidden.layers[2], activation = "tanh", name = "bottleneck") %>%
    layer_dense(units = hidden.layers[3], activation = "tanh") %>%
    layer_dense(units = ncol(x_train))
  summary(model)
  model %>% compile(loss = "mean_squared_error", optimizer = "adam")
  model %>% fit(x = x_train, y = x_train, epochs = 2000, verbose = 0)
  mse.ae2 <- evaluate(model, x_train, x_train)
  print(mse.ae2)
  intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output)
  intermediate_output <- predict(intermediate_layer_model, x_train)
  pred <- model %>% predict(x_train)
  (x_train-pred)^2
}

swath_overlap <- function()
{
  id2 <- data.frame(Accession=protein_matrix[-1,1])
  dir <- paste(Sys.getenv("INF"),"Caprion",sep="/")
  source(paste(dir,"utils/olink.inc",sep="/"))
  tmp <- read.delim(paste(dir,"inf1.tmp",sep="/"),as.is=TRUE,col.names=c("prot","uniprot"))
  olink_inf1 <- read.delim(paste(dir,"olink.inf.panel.annot.tsv",sep="/"),
                           as.is=TRUE)[c("target","target.short","uniprot","panel","hgnc_symbol")]
  inf1 <- merge(tmp,olink_inf1,by="uniprot")
  swath_chk <- merge(inf1,id2,by.x="uniprot",by.y="Accession")
  write.csv(swath_chk,file="swath_inf1.chk",row.names=FALSE,quote=FALSE)
  xlsx <- paste(dir,"Olink validation data all panels.xlsx",sep="/")
  tabs <- c("CVD II","CVD III","Inflammation","Neurology")
  olink_panel(xlsx,tabs,TRUE,92,FALSE)
  swath_inf1 <- merge(id2,Inflammation[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.")
  write.csv(swath_inf1,file="swath_inf1.csv",row.names=FALSE,quote=FALSE)
  swath_cvd2 <- merge(id2,CVD_II[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.")
  write.csv(swath_cvd2,file="swath_cvd2.csv",row.names=FALSE,quote=FALSE)
  swath_cvd3 <- merge(id2,CVD_III[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No")
  write.csv(swath_cvd3,file="swath_cvd3.csv",row.names=FALSE,quote=FALSE)
  swath_neurology <- merge(id2,Neurology[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No")
  write.csv(swath_neurology,file="swath_neurology.csv",row.names=FALSE,quote=FALSE)
  somalogic <- read.delim(paste(dir,"SOMALOGIC_Master_Table_160410_1129info.tsv",sep="/"),as.is=TRUE)
  swath_somalogic <- merge(id2,subset(somalogic,UniProt!="NA")[c("Target","UniProt")],by.x="Accession",by.y="UniProt")
  write.csv(unique(swath_somalogic),file="swath_somalogic.csv",row.names=FALSE,quote=FALSE)
  swath_caprion <- merge(id2,protein_list[c("Protein","Accession","Gene")],by="Accession")
  write.csv(unique(swath_caprion),file="swath_caprion.csv",row.names=FALSE,quote=FALSE)
  library(reshape)
  Inflammation <- rename(Inflammation,c(UniProt.No.="UniProt.No"))
  CVD_II <- rename(CVD_II,c(UniProt.No.="UniProt.No"))
  Olink <- rbind(Inflammation[c("Target","UniProt.No")],CVD_II[c("Target","UniProt.No")],
                 CVD_III[c("Target","UniProt.No")],Neurology[c("Target","UniProt.No")])
  Olink <- within(subset(Olink,UniProt.No!="NA"), {OlinkID=UniProt.No; OlinkTarget=Target})
  SomaLogic <- within(subset(somalogic,UniProt!="NA"),{SomaLogicID=UniProt;SomaLogicTarget=Target})
  swath_olink <- merge(id2,Olink[c("UniProt.No","OlinkID","OlinkTarget")],by.x="Accession",by.y="UniProt.No")
  swath_olink_somalogic <- merge(swath_olink,SomaLogic[c("SomaLogicTarget","UniProt","SomaLogicID")],by.x="Accession",by.y="UniProt")
  swath_olink_somalogic_caprion <- merge(swath_olink_somalogic,protein_list[c("Protein","Accession","Gene")],by="Accession")
  save(swath_olink_somalogic_caprion,file="swath-ms-overlap.rda")
  library(VennDiagram)
  plist <- list(id2[["Accession"]],protein_list[["Accession"]],Olink[["UniProt.No"]],SomaLogic[["UniProt"]])
  cnames=c("SWATH-MS", "Caprion", "Olink", "SomaLogic")
  venn.diagram(x = plist, category.names=cnames, filename='4_way_venn_diagram.png', imagetype="png", output=TRUE)
}

affymetrix <- function(select)
{
  affymetrix.id <- with(subset(swath_pheno,!is.na(swathMS_id)),Affymetrix_gwasQC_bl)
  write.table(affymetrix.id[!is.na(affymetrix.id)],file="affymetrix.id",row.names=FALSE,col.names=FALSE,quote=FALSE)
  id1_id2_0 <- read.table(paste(Caprion,"interval.samples",sep="/"),skip=2,col.names=c("ID_1","ID_2","missing"))
  missing <- read.table(paste(Caprion,"merged_imputation.missing",sep="/"),col.names=c("affymetrix_gwasqc_bl","missing"))
  id1_id2_missing <- merge(id1_id2_0[,-3],missing,by.x="ID_1",by.y="affymetrix_gwasqc_bl")
  eigenvec <- read.delim(paste(Caprion,"merged_imputation.eigenvec",sep="/"))
  covariates <- merge(swath_pheno[c("Affymetrix_gwasQC_bl","sex","age","bmi")],eigenvec[,-1],
                      by.x="Affymetrix_gwasQC_bl",by.y="IID")
  id1_id2_missing_covariates <- subset(merge(id1_id2_missing,covariates,by.x="ID_1",by.y="Affymetrix_gwasQC_bl"),ID_1%in%affymetrix.id)
  expr <- id1_id2_missing_covariates[,-(1:3)]
  rownames(expr) <- id1_id2_missing_covariates[,1]
  write.table(t(expr),"1.covariates.txt",col.names=TRUE,quote=FALSE,sep="\t")
  uniprot <- names(swath_protein[,-c(1:2)])
  cat(uniprot, file="swath-ms.uniprot")
  if (select=="SomaLogic")
  {
     sample.file <- "SomaLogic.sample"
     uniprot <- scan(paste(Caprion,"SomaLogic.uniprot",sep="/"),what="")
     SomaLogic <- subset(protein_matrix[-1,],Internal.ID%in%uniprot)
     names(SomaLogic) <- protein_matrix[1,]
     rownames(SomaLogic) <- SomaLogic[,1]
     d1 <- t(SomaLogic[,-1])
     d1 <- data.frame(swath_id=rownames(d1),round(d1,3))
     protein <- merge(swath_pheno[c("swathMS_id","Affymetrix_gwasQC_bl")],d1,by.x="swathMS_id",by.y="swath_id")
     p <- protein[,-1]
  } else {
     sample.file <- "swath-ms.sample"
     p <- swath_protein[,-1]
  }
  p <- within(p,{for(i in names(p)[-1]) assign(paste0(i,"_invn"), gap::invnormal(p[i]))})
  p <- p[,-ncol(p)]
  id1_id2_missing_covariates <- merge(id1_id2_missing,covariates,by.x="ID_1",by.y="Affymetrix_gwasQC_bl")
  id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,p,
                                                 by.x="ID_1",by.y="Affymetrix_gwasQC_bl")
  gap::snptest_sample(id1_id2_missing_covariates_phenotypes,sample.file,
                      C=c("age","bmi",paste0("PC",1:20)),
                      D="sex",
                      P=names(p)[-1])
}