# 25-1-2021 JHZ caprion_xlsx <- function() { library(openxlsx) # list protein_list <- read.xlsx("Caprion_pilot_protein_list.xlsx") # workbook wb <- "ZWK_EDR_20191002.xlsx" Legend <- read.xlsx(wb, sheet = 1, startRow = 3) Samples <- read.xlsx(wb, sheet = 2, startRow = 5) Annotations <- read.xlsx(wb, sheet = 3, startRow = 5) rawIGs <- read.xlsx(wb, sheet = 4, startRow = 5) Normalized_Peptides <- read.xlsx(wb, sheet = 5, startRow = 5) Protein_All_Peptides <- read.xlsx(wb, sheet = 6, startRow = 5) Protein_DR_Filt_Peptides <- read.xlsx(wb, sheet = 7, startRow = 5) save(protein_list,Legend,Samples,Annotations,rawIGs,Normalized_Peptides,Protein_All_Peptides,Protein_DR_Filt_Peptides,file="caprion.rda") } tromso_xlsx <- function() { wb <- "solomon18.xlsx" library(openxlsx) sheet3 <- read.xlsx(wb, sheet = 3, startRow = 1) list(sheet3=sheet3) } caprion_inf <- function() { tmp <- read.delim("inf1.tmp",as.is=TRUE,col.names=c("prot","uniprot")) olink_inf1 <- read.delim("olink.inf.panel.annot.tsv",as.is=TRUE)[c("target","target.short","uniprot","panel","hgnc_symbol")] inf1 <- merge(tmp,olink_inf1,by="uniprot") caprion_chk <- merge(inf1,protein_list[c("Accession","Protein","Gene")],by.x="uniprot",by.y="Accession") write.csv(caprion_chk,file="caprion_inf1.chk",row.names=FALSE,quote=FALSE) source("utils/olink.inc") xlsx <- "Olink validation data all panels.xlsx" tabs <- c("CVD II","CVD III","Inflammation","Neurology") olink_panel(xlsx,tabs,TRUE,92,FALSE) caprion_inf1 <- merge(protein_list[c("Accession","Protein","Gene")],Inflammation[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.") write.csv(caprion_inf1,file="caprion_inf1.csv",row.names=FALSE,quote=FALSE) caprion_cvd2 <- merge(protein_list[c("Accession","Protein","Gene")],CVD_II[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.") write.csv(caprion_cvd2,file="caprion_cvd2.csv",row.names=FALSE,quote=FALSE) caprion_cvd3 <- merge(protein_list[c("Accession","Protein","Gene")],CVD_III[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No") write.csv(caprion_cvd3,file="caprion_cvd3.csv",row.names=FALSE,quote=FALSE) caprion_neurology <- merge(protein_list[c("Accession","Protein","Gene")],Neurology[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No") write.csv(caprion_neurology,file="caprion_neurology.csv",row.names=FALSE,quote=FALSE) somalogic <- read.delim("SOMALOGIC_Master_Table_160410_1129info.tsv",as.is=TRUE) caprion_somalogic <- merge(protein_list[c("Accession","Protein","Gene")],somalogic[c("Target","UniProt")],by.x="Accession",by.y="UniProt") write.csv(unique(caprion_somalogic),file="caprion_somalogic.csv",row.names=FALSE,quote=FALSE) } plotfun <- function(col) { d <- df[,col] xlab <- "Individual" ylab <- colnames(df)[col] plot(d, xlab=xlab, ylab = ylab, main=ylab, type = "p", cex=0.6) hist(d, xlab=ylab, main="") boxplot(d, horizontal=TRUE, cex=0.6) } regfun <- function(col) { p <- df[,col] l <- lm(p~sex) s <- summary(l) c <- with(s,coefficients) r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t") cat(r,"\n",append=TRUE, file="sex.tsv", sep="") l <- lm(p~age+sex+bmi) s <- summary(l) c <- with(s,coefficients) r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t") cat(r,"\n",append=TRUE, file="lm.tsv", sep="") } ae <- function(X,hidden.layers=c(10,2,10),pdf="ae.pdf") { require(ANN2) pdf(pdf) AE <- autoencoder(X, hidden.layers, loss.type = 'pseudo-huber', activ.functions = c('tanh','linear','tanh'), batch.size = 8, optim.type = 'adam', n.epochs = 1000, val.prop = 0) # Plot loss during training plot(AE) # Make reconstruction and compression plots reconstruction_plot(AE, X) compression_plot(AE, X) # Reconstruct data and show states with highest anomaly scores recX <- reconstruct(AE, X) sort(recX$anomaly_scores, decreasing = TRUE) dev.off() save(AE,recX,file="ae.rda") } minmax <- function(x) (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) ae_caprion <- function(x,hidden.layers=c(987,197,987)) { library(ggplot2) library(plotly) library(keras) x_train <- apply(x, 2, minmax) x_train <- as.matrix(x_train) model <- keras_model_sequential() model %>% layer_dense(units = hidden.layers[1], activation = "tanh", input_shape = ncol(x_train)) %>% layer_dense(units = hidden.layers[2], activation = "tanh", name = "bottleneck") %>% layer_dense(units = hidden.layers[3], activation = "tanh") %>% layer_dense(units = ncol(x_train)) summary(model) model %>% compile(loss = "mean_squared_error", optimizer = "adam") model %>% fit(x = x_train, y = x_train, epochs = 2000, verbose = 0) mse.ae2 <- evaluate(model, x_train, x_train) print(mse.ae2) intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output) intermediate_output <- predict(intermediate_layer_model, x_train) pred <- model %>% predict(x_train) (x_train-pred)^2 } extract_peptide <- function(protein="ERAP2") { load("caprion.rda") d <- subset(Normalized_Peptides,Protein==paste0(protein,"_HUMAN")) tokeep <- -(1:6) dt <- t(d[,tokeep]) colnames(dt) <- with(d,paste(Modified.Peptide.Sequence,Isotope.Group.ID,sep="_")) peptides <- data.frame(caprion_id=names(d)[tokeep],dt) } tromso_sample <- function() # replication of TROMSO study { d <- tromso_xlsx() pp <- names(table(with(d,sheet3["Protein.Name"]))) overlap <- colnames(t1)[colnames(t1)%in%pp] selected <- with(d, sheet3[["Protein.Name"]] %in% overlap) s <- with(d, sheet3)[selected,c("Protein.Name","Ensembl.ID","Sentinel.Variant")] rsid <- levels(as.factor(s$Sentinel.Variant)) ps <- phenoscanner::phenoscanner(snpquery=rsid, catalogue="pQTL") snps <- with(ps,snps) print(sort(as.numeric(levels(with(snps,chr))))) tromso <- merge(s,snps,by.x="Sentinel.Variant",by.y="snp") write.table(tromso[c("Sentinel.Variant","Protein.Name","chr")], file="tromso/tromso.txt",col.names=FALSE,row.names=FALSE,quote=FALSE) prot <- pheno_protein[c("caprion_id","affymetrix_gwasqc_bl",overlap)] prot <- within(prot,{for(i in names(prot[,-(1:2)])) assign(paste0(i,"_invn"),gap::invnormal(prot[i]))}) prot <- prot[,-ncol(prot)] id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,prot[,-1], by.x="ID_1",by.y="affymetrix_gwasqc_bl") m <- merge(id1_id2_missing[,-3],id1_id2_missing_covariates_phenotypes,by=c("ID_1","ID_2"),all.x=TRUE) gap::snptest_sample(m,"tromso/tromso.sample", C=c("age","bmi",paste0("PC",1:20)), D="sex", P=names(prot[,-(1:2)])) } peptides_sample <- function(select="SomaLogic") # analysis on peptides { p <- "ERAP2" d <- extract_peptide("ERAP2") id <- pheno_protein[,1:2] peptides <- merge(id,d,by="caprion_id") peptides <- within(peptides,{for(i in names(peptides[,-(1:2)])) assign(paste0(i,"_invn"),gap::invnormal(peptides[i]))}) peptides <- peptides[,-ncol(peptides)] id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,peptides[,-1], by.x="ID_1",by.y="affymetrix_gwasqc_bl") gap::snptest_sample(id1_id2_missing_covariates_phenotypes,paste0("ERAP2/",p,".sample"), C=c("age","bmi",paste0("PC",1:20)), D="sex", P=names(peptides)[-(1:2)]) } affymetrix <- function(select) { # phenotypes names(Samples) <- c("caprion_id","external_id","comment") phenotypes <- read.delim("interval_caprion_pilot_samples_phenotype_data.tsv",as.is=TRUE) phenotypes <- within(phenotypes,{ age <- agepulse sex <- sexpulse bmi <- wt_bl/ht_bl/ht_bl crp <- crp_bl transf <- transf_bl }) affymetrix.id <- with(phenotypes,affymetrix_gwasqc_bl) write.table(affymetrix.id[!is.na(affymetrix.id)],file="affymetrix.id",row.names=FALSE,col.names=FALSE,quote=FALSE) pd <- merge(phenotypes[c("caprion_id","affymetrix_gwasqc_bl","sex","age","bmi","crp","transf")],Samples,by="caprion_id") td <- Protein_All_Peptides rownames(td) <- gsub("_HUMAN","",td[,1]) pap <- merge(protein_list[c("Protein","Accession","Gene")], td, by="Protein") t1 <- t(td[,-1]) td <- data.frame(caprion_id=row.names(t1),data.frame(t1)) uniprot <- merge(protein_list[c("Protein","Accession")],Protein_All_Peptides,by="Protein") rownames(uniprot) <- uniprot[["Accession"]] cat(uniprot[["Accession"]], file="caprion.uniprot") d <- Protein_All_Peptides[,-1] # proteins library(biomaRt) ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="grch37.ensembl.org", path="/biomart/martservice") attr <- listAttributes(ensembl) g <- c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'description', 'hgnc_symbol') t <- c('ensembl_transcript_id', 'transcription_start_site', 'transcript_start', 'transcript_end') gtu <- getBM(attributes = c(g,t,"uniprotswissprot"), mart = ensembl) under_score <- with(gtu,grep("_",chromosome_name)) selected <- setdiff(1:nrow(gtu),under_score) ap <- merge(Annotations[,1:3],Protein_DR_Filt_Peptides,by="Protein") rownames(ap) <- ap[,2] tap <- data.frame(caprion_id=names(ap)[-(1:3)],t(ap[,-(1:3)])) idp <- subset(merge(pd[,1:2],tap,by="caprion_id"),!is.na(affymetrix_gwasqc_bl)) ord <- with(idp,order(affymetrix_gwasqc_bl)) m <- idp[ord,] rownames(m) <- m[["affymetrix_gwasqc_bl"]] mm <- data.frame(Accession=names(m)[-(1:2)],t(m[,-(1:2)])) bed <- merge(gtu[selected,c("chromosome_name","start_position","end_position","uniprotswissprot","ensembl_gene_id")],mm, by.y="Accession",by.x="uniprotswissprot") header <- names(bed)[-1] header[1:4] <- c("#chr","start","end","gene_id") cat(header,file="data/caprion.bed",sep="\t") cat("\n",file="data/caprion.bed",append=TRUE) bed <- within(bed,{chr <- as.numeric(chromosome_name)}) ord <- with(bed,order(chr,start_position,end_position)) reset <- with(bed,is.na(chromosome_name)) bed[reset,5] <- bed[reset, 1] bed[reset,2] <- 99+1:nrow(bed[reset,]) bed[reset,3] <- 1 bed[reset,4] <- 2 write.table(bed[ord,-c(1,ncol(bed))],file="data/caprion.bed",append=TRUE,col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t") # SNPTEST id1_id2_0 <- read.table("interval.samples",skip=2,col.names=c("ID_1","ID_2","missing")) missing <- read.table("data/merged_imputation.missing",col.names=c("affymetrix_gwasqc_bl","missing")) id1_id2_missing <- merge(id1_id2_0[,-3],missing,by.x="ID_1",by.y="affymetrix_gwasqc_bl") eigenvec <- read.delim("data/merged_imputation.eigenvec") if (select=="SomaLogic") { pheno_protein <- merge(pd,td,by="caprion_id") df <- pheno_protein[,-(1:8)] sample.file <- "SomaLogic.sample" uniprot <- scan("SomaLogic.uniprot",what="") SomaLogic <- subset(pap,Accession%in%uniprot) d1 <- t(SomaLogic[,-c(1:3)]) d1 <- data.frame(caprion_id=rownames(d1),round(d1,3)) protein <- merge(phenotypes[c("caprion_id","affymetrix_gwasqc_bl")],d1,by="caprion_id") p <- protein[,-1] p <- within(p,{for(i in names(p)[-1]) assign(paste0(i,"_invn"), gap::invnormal(p[i]))}) p <- p[,-ncol(p)] covariates <- merge(pheno_protein[c("affymetrix_gwasqc_bl","sex","age","bmi")],eigenvec[,-1], by.x="affymetrix_gwasqc_bl",by.y="IID") id1_id2_missing_covariates <- merge(id1_id2_missing,covariates,by.x="ID_1",by.y="affymetrix_gwasqc_bl") id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,p, by.x="ID_1",by.y="affymetrix_gwasqc_bl") expr <- id1_id2_missing_covariates[,-(1:3)] rownames(expr) <- id1_id2_missing_covariates[,1] write.table(t(expr),"data/1.covariates.txt",col.names=TRUE,quote=FALSE,sep="\t") } else { sample.file <- "data/caprion.sample" tu <- t(uniprot[,-(1:2)]) td <- data.frame(caprion_id=rownames(tu),tu) pheno_protein <- merge(pd,td,by="caprion_id") p <- pheno_protein[,-c(1,3:9)] p <- within(p,{for(i in names(p)[-1]) assign(paste0(i,"_invn"), gap::invnormal(p[i]))}) p <- p[,-ncol(p)] covariates <- merge(pheno_protein[c("affymetrix_gwasqc_bl","sex","age","bmi")],eigenvec[,-1], by.x="affymetrix_gwasqc_bl",by.y="IID") id1_id2_missing_covariates <- merge(id1_id2_missing,covariates,by.x="ID_1",by.y="affymetrix_gwasqc_bl") id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,p, by.x="ID_1",by.y="affymetrix_gwasqc_bl") } gap::snptest_sample(id1_id2_missing_covariates_phenotypes,sample.file, C=c("age","bmi",paste0("PC",1:20)), D="sex", P=names(p)[-1]) # exclusion excl_id <- read.table("11-3.id", as.is=TRUE, header=TRUE) eleven <- excl_id[1:11,1] three <- excl_id[12:14,1] idx11 <- colnames(d)%in%eleven idx3 <- colnames(d)%in%three group <- rep(1,ncol(d)) group[idx11] <- 2 group[idx3] <- 3 col.group=c("black","blue","red") } # awk 'a[$0]++<1' caprion.bed | awk -vOFS="\t" '{if(NR==1) gsub(/X/,"",$0); else $1="chr" $1};1' | gzip -f > data/caprion.expression.bed.gz