# 25-1-2021 JHZ

caprion_xlsx <- function()
{
  library(openxlsx)
  # list
  protein_list <- read.xlsx("Caprion_pilot_protein_list.xlsx")
  # workbook
  wb <- "ZWK_EDR_20191002.xlsx"
  Legend <- read.xlsx(wb, sheet = 1, startRow = 3)
  Samples <- read.xlsx(wb, sheet = 2, startRow = 5)
  Annotations <- read.xlsx(wb, sheet = 3, startRow = 5)
  rawIGs <- read.xlsx(wb, sheet = 4, startRow = 5)
  Normalized_Peptides <- read.xlsx(wb, sheet = 5, startRow = 5)
  Protein_All_Peptides <- read.xlsx(wb, sheet = 6, startRow = 5)
  Protein_DR_Filt_Peptides <- read.xlsx(wb, sheet = 7, startRow = 5)
  save(protein_list,Legend,Samples,Annotations,rawIGs,Normalized_Peptides,Protein_All_Peptides,Protein_DR_Filt_Peptides,file="caprion.rda")
}

tromso_xlsx <- function()
{
  wb <- "solomon18.xlsx"
  library(openxlsx)
  sheet3 <- read.xlsx(wb, sheet = 3, startRow = 1)
  list(sheet3=sheet3)
}

caprion_inf <- function()
{
  tmp <- read.delim("inf1.tmp",as.is=TRUE,col.names=c("prot","uniprot"))
  olink_inf1 <- read.delim("olink.inf.panel.annot.tsv",as.is=TRUE)[c("target","target.short","uniprot","panel","hgnc_symbol")]
  inf1 <- merge(tmp,olink_inf1,by="uniprot")
  caprion_chk <- merge(inf1,protein_list[c("Accession","Protein","Gene")],by.x="uniprot",by.y="Accession")
  write.csv(caprion_chk,file="caprion_inf1.chk",row.names=FALSE,quote=FALSE)
  source("utils/olink.inc")
  xlsx <- "Olink validation data all panels.xlsx"
  tabs <- c("CVD II","CVD III","Inflammation","Neurology")
  olink_panel(xlsx,tabs,TRUE,92,FALSE)
  caprion_inf1 <- merge(protein_list[c("Accession","Protein","Gene")],Inflammation[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.")
  write.csv(caprion_inf1,file="caprion_inf1.csv",row.names=FALSE,quote=FALSE)
  caprion_cvd2 <- merge(protein_list[c("Accession","Protein","Gene")],CVD_II[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.")
  write.csv(caprion_cvd2,file="caprion_cvd2.csv",row.names=FALSE,quote=FALSE)
  caprion_cvd3 <- merge(protein_list[c("Accession","Protein","Gene")],CVD_III[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No")
  write.csv(caprion_cvd3,file="caprion_cvd3.csv",row.names=FALSE,quote=FALSE)
  caprion_neurology <- merge(protein_list[c("Accession","Protein","Gene")],Neurology[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No")
  write.csv(caprion_neurology,file="caprion_neurology.csv",row.names=FALSE,quote=FALSE)
  somalogic <- read.delim("SOMALOGIC_Master_Table_160410_1129info.tsv",as.is=TRUE)
  caprion_somalogic <- merge(protein_list[c("Accession","Protein","Gene")],somalogic[c("Target","UniProt")],by.x="Accession",by.y="UniProt")
  write.csv(unique(caprion_somalogic),file="caprion_somalogic.csv",row.names=FALSE,quote=FALSE)
}

plotfun <- function(col) 
{
  d <- df[,col]
  xlab <- "Individual"
  ylab <- colnames(df)[col]
  plot(d, xlab=xlab, ylab = ylab, main=ylab, type = "p", cex=0.6)
  hist(d, xlab=ylab, main="")
  boxplot(d, horizontal=TRUE, cex=0.6)
}

regfun <- function(col)
{
  p <- df[,col]
  l <- lm(p~sex)
  s <- summary(l)
  c <- with(s,coefficients)
  r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t")
  cat(r,"\n",append=TRUE, file="sex.tsv", sep="")
  l <- lm(p~age+sex+bmi)
  s <- summary(l)
  c <- with(s,coefficients)
  r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t")
  cat(r,"\n",append=TRUE, file="lm.tsv", sep="")
}

ae <- function(X,hidden.layers=c(10,2,10),pdf="ae.pdf")
{
   require(ANN2)
   pdf(pdf)
   AE <- autoencoder(X, hidden.layers, loss.type = 'pseudo-huber',
                     activ.functions = c('tanh','linear','tanh'),
                     batch.size = 8, optim.type = 'adam',
                     n.epochs = 1000, val.prop = 0)
 # Plot loss during training
   plot(AE)
 # Make reconstruction and compression plots
   reconstruction_plot(AE, X)
   compression_plot(AE, X)
 # Reconstruct data and show states with highest anomaly scores
   recX <- reconstruct(AE, X)
   sort(recX$anomaly_scores, decreasing = TRUE)
   dev.off()
   save(AE,recX,file="ae.rda")
}

minmax <- function(x) (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))

ae_caprion <- function(x,hidden.layers=c(987,197,987))
{
  library(ggplot2)
  library(plotly)
  library(keras)
  x_train <- apply(x, 2, minmax)
  x_train <- as.matrix(x_train)
  model <- keras_model_sequential()
  model %>%
    layer_dense(units = hidden.layers[1], activation = "tanh", input_shape = ncol(x_train)) %>%
    layer_dense(units = hidden.layers[2], activation = "tanh", name = "bottleneck") %>%
    layer_dense(units = hidden.layers[3], activation = "tanh") %>%
    layer_dense(units = ncol(x_train))
  summary(model)
  model %>% compile(loss = "mean_squared_error", optimizer = "adam")
  model %>% fit(x = x_train, y = x_train, epochs = 2000, verbose = 0)
  mse.ae2 <- evaluate(model, x_train, x_train)
  print(mse.ae2)
  intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output)
  intermediate_output <- predict(intermediate_layer_model, x_train)
  pred <- model %>% predict(x_train)
  (x_train-pred)^2
}

extract_peptide <- function(protein="ERAP2")
{
  load("caprion.rda")
  d <- subset(Normalized_Peptides,Protein==paste0(protein,"_HUMAN"))
  tokeep <- -(1:6)
  dt <- t(d[,tokeep])
  colnames(dt) <- with(d,paste(Modified.Peptide.Sequence,Isotope.Group.ID,sep="_"))
  peptides <- data.frame(caprion_id=names(d)[tokeep],dt)
}

tromso_sample <- function()
# replication of TROMSO study
{
  d <- tromso_xlsx()
  pp <- names(table(with(d,sheet3["Protein.Name"])))
  overlap <- colnames(t1)[colnames(t1)%in%pp]
  selected <- with(d, sheet3[["Protein.Name"]] %in% overlap)
  s <- with(d, sheet3)[selected,c("Protein.Name","Ensembl.ID","Sentinel.Variant")]
  rsid <- levels(as.factor(s$Sentinel.Variant))
  ps <- phenoscanner::phenoscanner(snpquery=rsid, catalogue="pQTL")
  snps <- with(ps,snps)
  print(sort(as.numeric(levels(with(snps,chr)))))
  tromso <- merge(s,snps,by.x="Sentinel.Variant",by.y="snp")
  write.table(tromso[c("Sentinel.Variant","Protein.Name","chr")],
              file="tromso/tromso.txt",col.names=FALSE,row.names=FALSE,quote=FALSE)
  prot <- pheno_protein[c("caprion_id","affymetrix_gwasqc_bl",overlap)]
  prot <- within(prot,{for(i in names(prot[,-(1:2)])) assign(paste0(i,"_invn"),gap::invnormal(prot[i]))})
  prot <- prot[,-ncol(prot)]
  id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,prot[,-1],
                                                 by.x="ID_1",by.y="affymetrix_gwasqc_bl")
  m <- merge(id1_id2_missing[,-3],id1_id2_missing_covariates_phenotypes,by=c("ID_1","ID_2"),all.x=TRUE)
  gap::snptest_sample(m,"tromso/tromso.sample",
                      C=c("age","bmi",paste0("PC",1:20)),
                      D="sex",
                      P=names(prot[,-(1:2)]))
}

peptides_sample <- function(select="SomaLogic")
# analysis on peptides
{
  p <- "ERAP2"
  d <- extract_peptide("ERAP2")
  id <- pheno_protein[,1:2]
  peptides <- merge(id,d,by="caprion_id")
  peptides <- within(peptides,{for(i in names(peptides[,-(1:2)])) assign(paste0(i,"_invn"),gap::invnormal(peptides[i]))})
  peptides <- peptides[,-ncol(peptides)]
  id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,peptides[,-1],
                                                 by.x="ID_1",by.y="affymetrix_gwasqc_bl")
  gap::snptest_sample(id1_id2_missing_covariates_phenotypes,paste0("ERAP2/",p,".sample"),
                      C=c("age","bmi",paste0("PC",1:20)),
                      D="sex",
                      P=names(peptides)[-(1:2)])
}

affymetrix <- function(select)
{
# phenotypes
  names(Samples) <- c("caprion_id","external_id","comment")
  phenotypes <- read.delim("interval_caprion_pilot_samples_phenotype_data.tsv",as.is=TRUE)
  phenotypes <- within(phenotypes,{
    age <- agepulse
    sex <- sexpulse
    bmi <- wt_bl/ht_bl/ht_bl
    crp <- crp_bl
    transf <- transf_bl
  })
  affymetrix.id <- with(phenotypes,affymetrix_gwasqc_bl)
  write.table(affymetrix.id[!is.na(affymetrix.id)],file="affymetrix.id",row.names=FALSE,col.names=FALSE,quote=FALSE)
  pd <- merge(phenotypes[c("caprion_id","affymetrix_gwasqc_bl","sex","age","bmi","crp","transf")],Samples,by="caprion_id")
  td <- Protein_All_Peptides
  rownames(td) <- gsub("_HUMAN","",td[,1])
  pap <- merge(protein_list[c("Protein","Accession","Gene")], td, by="Protein")
  t1 <- t(td[,-1])
  td <- data.frame(caprion_id=row.names(t1),data.frame(t1))
  uniprot <- merge(protein_list[c("Protein","Accession")],Protein_All_Peptides,by="Protein")
  rownames(uniprot) <- uniprot[["Accession"]]
  cat(uniprot[["Accession"]], file="caprion.uniprot")
  d <- Protein_All_Peptides[,-1]
# proteins
  library(biomaRt)
  ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="grch37.ensembl.org", path="/biomart/martservice")
  attr <- listAttributes(ensembl)
  g <- c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'description', 'hgnc_symbol')
  t <- c('ensembl_transcript_id', 'transcription_start_site', 'transcript_start', 'transcript_end')
  gtu <- getBM(attributes = c(g,t,"uniprotswissprot"), mart = ensembl)
  under_score <- with(gtu,grep("_",chromosome_name))
  selected <- setdiff(1:nrow(gtu),under_score)
  ap <- merge(Annotations[,1:3],Protein_DR_Filt_Peptides,by="Protein")
  rownames(ap) <- ap[,2]
  tap <- data.frame(caprion_id=names(ap)[-(1:3)],t(ap[,-(1:3)]))
  idp <- subset(merge(pd[,1:2],tap,by="caprion_id"),!is.na(affymetrix_gwasqc_bl))
  ord <- with(idp,order(affymetrix_gwasqc_bl))
  m <- idp[ord,]
  rownames(m) <- m[["affymetrix_gwasqc_bl"]]
  mm <- data.frame(Accession=names(m)[-(1:2)],t(m[,-(1:2)]))
  bed <- merge(gtu[selected,c("chromosome_name","start_position","end_position","uniprotswissprot","ensembl_gene_id")],mm,
               by.y="Accession",by.x="uniprotswissprot")
  header <- names(bed)[-1]
  header[1:4] <- c("#chr","start","end","gene_id")
  cat(header,file="data/caprion.bed",sep="\t")
  cat("\n",file="data/caprion.bed",append=TRUE)
  bed <- within(bed,{chr <- as.numeric(chromosome_name)})
  ord <- with(bed,order(chr,start_position,end_position))
  reset <- with(bed,is.na(chromosome_name))
  bed[reset,5] <- bed[reset, 1]
  bed[reset,2] <- 99+1:nrow(bed[reset,])
  bed[reset,3] <- 1
  bed[reset,4] <- 2
  write.table(bed[ord,-c(1,ncol(bed))],file="data/caprion.bed",append=TRUE,col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
# SNPTEST
  id1_id2_0 <- read.table("interval.samples",skip=2,col.names=c("ID_1","ID_2","missing"))
  missing <- read.table("data/merged_imputation.missing",col.names=c("affymetrix_gwasqc_bl","missing"))
  id1_id2_missing <- merge(id1_id2_0[,-3],missing,by.x="ID_1",by.y="affymetrix_gwasqc_bl")
  eigenvec <- read.delim("data/merged_imputation.eigenvec")
  if (select=="SomaLogic")
  {
    pheno_protein <- merge(pd,td,by="caprion_id")
    df <- pheno_protein[,-(1:8)]
    sample.file <- "SomaLogic.sample"
    uniprot <- scan("SomaLogic.uniprot",what="")
    SomaLogic <- subset(pap,Accession%in%uniprot)
    d1 <- t(SomaLogic[,-c(1:3)])
    d1 <- data.frame(caprion_id=rownames(d1),round(d1,3))
    protein <- merge(phenotypes[c("caprion_id","affymetrix_gwasqc_bl")],d1,by="caprion_id")
    p <- protein[,-1]
    p <- within(p,{for(i in names(p)[-1]) assign(paste0(i,"_invn"), gap::invnormal(p[i]))})
    p <- p[,-ncol(p)]
    covariates <- merge(pheno_protein[c("affymetrix_gwasqc_bl","sex","age","bmi")],eigenvec[,-1],
                        by.x="affymetrix_gwasqc_bl",by.y="IID")
    id1_id2_missing_covariates <- merge(id1_id2_missing,covariates,by.x="ID_1",by.y="affymetrix_gwasqc_bl")
    id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,p,
                                                   by.x="ID_1",by.y="affymetrix_gwasqc_bl")
    expr <- id1_id2_missing_covariates[,-(1:3)]
    rownames(expr) <- id1_id2_missing_covariates[,1]
    write.table(t(expr),"data/1.covariates.txt",col.names=TRUE,quote=FALSE,sep="\t")
  } else {
    sample.file <- "data/caprion.sample"
    tu <- t(uniprot[,-(1:2)])
    td <- data.frame(caprion_id=rownames(tu),tu)
    pheno_protein <- merge(pd,td,by="caprion_id")
    p <- pheno_protein[,-c(1,3:9)]
    p <- within(p,{for(i in names(p)[-1]) assign(paste0(i,"_invn"), gap::invnormal(p[i]))})
    p <- p[,-ncol(p)]
    covariates <- merge(pheno_protein[c("affymetrix_gwasqc_bl","sex","age","bmi")],eigenvec[,-1],
                        by.x="affymetrix_gwasqc_bl",by.y="IID")
    id1_id2_missing_covariates <- merge(id1_id2_missing,covariates,by.x="ID_1",by.y="affymetrix_gwasqc_bl")
    id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,p,
                                                   by.x="ID_1",by.y="affymetrix_gwasqc_bl")
  }
  gap::snptest_sample(id1_id2_missing_covariates_phenotypes,sample.file,
                      C=c("age","bmi",paste0("PC",1:20)),
                      D="sex",
                      P=names(p)[-1])
# exclusion
  excl_id <- read.table("11-3.id", as.is=TRUE, header=TRUE)
  eleven <- excl_id[1:11,1]
  three <- excl_id[12:14,1]
  idx11 <- colnames(d)%in%eleven
  idx3 <- colnames(d)%in%three
  group <- rep(1,ncol(d))
  group[idx11] <- 2
  group[idx3] <- 3
  col.group=c("black","blue","red")
}

# awk 'a[$0]++<1' caprion.bed | awk -vOFS="\t" '{if(NR==1) gsub(/X/,"",$0); else $1="chr" $1};1' | gzip -f > data/caprion.expression.bed.gz