# 9-3-2020 JHZ UniProt <- function() { library(UniProt.ws) x <- UniProt.ws(taxId=9606) columns(x) keytypes(x) keys <- with(hgTables,entrezGene) columns <- c("PDB","GENES","HGNC","PROTEIN-NAMES","UNIPROTKB") keytypes <- c("ENTREZ_GENE") UniProt.info <- select(x,keys,columns,keytypes) save(UniProt.info,file="UniProt.info.rda") library(org.Hs.eg.db) columns(org.Hs.eg.db) s <- select(org.Hs.eg.db,keys,c("UNIPROT","ENSEMBL","ENSEMBLPROT","SYMBOL","ENSEMBLTRANS")) require(hgu95av2.db) x <- hgu95av2UNIPROT mapped_genes <- mappedkeys(x) xx <- as.list(x[mapped_genes]) if(length(xx) > 0) xx[1:5] } swath_ms_data <- function() { swath_pheno <- read.csv("INTERVALdata_13FEB2020.csv",as.is=TRUE) idmap <- read.csv("omicsMap.csv",as.is=TRUE) vars <- c("Affymetrix_gwasQC_bl","caprion_id","swathMS_id","sexPulse","agePulse","ht_bl","wt_bl","CRP_bl","FERR_bl","TRANSFS_bl","TRANSF_bl","HEP_bl") swath_pheno <- within(merge(idmap,swath_pheno,by="identifier")[vars],{sex=sexPulse;age=agePulse;bmi=wt_bl/ht_bl/ht_bl}) # UCSC Uniprot IDs hgTables <- read.delim("hgTables.gz",quote="") load(paste(Sys.getenv("INF"),"Caprion","caprion.rda",sep="/")) m <- merge(Protein_All_Peptides,protein_list,by="Protein") id1 <- with(m,Accession) library(openxlsx) rds.xlsx <- "DIINT01_BATCH_1_MS_BATCH_INPUT_RDS_02DEC19 final.xlsx" protein_matrix.file <- "DIINT_protein_matrix final.xlsx" rds <- read.xlsx(rds.xlsx, sheet = 1, startRow = 1) protein_matrix <- read.xlsx(protein_matrix.file, sheet = 1, startRow = 1) id <- protein_matrix[-1,"Internal.ID"] o <-id[!id%in%with(hgTables,acc)] f <- function(i) {index <- grep(o[i],hgTables$accList); cat(index,o[i],"\n"); as.character(hgTables$accList[index])} for(i in 1:6) print(f(i)) # P01621, P01771, P01776, P01781, P08107, P62158 # P01619, P01772, P01764, P01780, P08107, P62158 # P08107 and P62158 were obsolete on May 27 and 10, 2015, respectively r <- c("P01619", "P01772", "P01764", "P01780", "P08107", "P62158") protein_matrix <- within(protein_matrix,{Internal.ID[Internal.ID%in%o] <- r}) p <- t(protein_matrix[-1,-1]) colnames(p) <- protein_matrix[-1,"Internal.ID"] rownames(p) <- protein_matrix[1,-1] head(p[,colnames(p)%in%o]) swath_protein <- data.frame(Internal.ID=colnames(protein_matrix)[-1],External.ID=rownames(p), p) swath_protein <-merge(swath_pheno[c("Affymetrix_gwasQC_bl","swathMS_id")],swath_protein[,-1],by.x="swathMS_id",by.y="External.ID") ord <- with(swath_protein,order(Affymetrix_gwasQC_bl)) swath_protein <- swath_protein[ord,] hgTables_vars <- c("acc","accList","uniprotName","ensGene","geneName") # under_score <- with(hgTables,grep("_",X.chrom)) # selected <- setdiff(1:nrow(hgTables),under_score) # m <- merge(protein_matrix[-1,],hgTables[selected,c("X.chrom","chromStart","chromEnd","name")],by.x="Internal.ID",by.y="name") annotated_protein_matrix <- merge(data.frame(Accession=protein_matrix[-1,1]),hgTables[hgTables_vars],by.x="Accession",by.y="acc") # biomart annotation library(biomaRt) ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="grch37.ensembl.org", path="/biomart/martservice") attr <- listAttributes(ensembl) g <- c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'description', 'hgnc_symbol') t <- c('ensembl_transcript_id', 'transcription_start_site', 'transcript_start', 'transcript_end') gtu <- getBM(attributes = c(g,t,"uniprotswissprot"), mart = ensembl) under_score <- with(gtu,grep("_",chromosome_name)) selected <- setdiff(1:nrow(gtu),under_score) p <- swath_protein[,-(1:2)] rownames(p) <- swath_protein[,1] tp <- data.frame(uniprot=colnames(p),t(p)) bed <- merge(gtu[selected,c("chromosome_name","start_position","end_position","uniprotswissprot","ensembl_gene_id")],tp, by.y="uniprot",by.x="uniprotswissprot") header <- names(bed)[-5] header[1:4] <- c("#chr","start","end","gene_id") cat(header,file="swath-ms.bed",sep="\t") cat("\n",file="swath-ms.bed",append=TRUE) bed <- within(bed,{chr <- as.numeric(chromosome_name)}) ord <- with(bed,order(chr,start_position,end_position)) reset <- with(bed,is.na(chromosome_name)) bed[reset,5] <- bed[reset, 1] bed[reset,2] <- 99+1:nrow(bed[reset,]) bed[reset,3] <- 1 bed[reset,4] <- 2 write.table(bed[ord,-c(1,ncol(bed))],file="swath-ms.bed",append=TRUE,col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t") save(hgTables,annotated_protein_matrix,protein_matrix,swath_protein,swath_pheno,bed,rds,file="swath-ms.rda") } # awk 'a[$0]++<1' swath-ms.bed | awk -vOFS="\t" '{if(NR==1) gsub(/X/,"",$0); else $1="chr" $1};1' | gzip -f > swath-ms.expression.bed.gz plotfun <- function(col) { d <- df[,col] xlab <- "Individual" ylab <- colnames(df)[col] plot(d, xlab=xlab, ylab = ylab, main=ylab, type = "p", cex=0.6) hist(d, xlab=ylab, main="") boxplot(d, horizontal=TRUE, cex=0.6) } regfun <- function(col) { p <- df[,col] l <- lm(p~sex,data=pheno_protein) s <- summary(l) c <- with(s,coefficients) r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t") cat(r,"\n",append=TRUE, file="sex.tsv", sep="") l <- lm(p~age+sex+bmi,data=pheno_protein) s <- summary(l) c <- with(s,coefficients) r <- paste(colnames(df)[col],paste(s$coefficients[,4],collapse="\t"),sep="\t") cat(r,"\n",append=TRUE, file="lm.tsv", sep="") } minmax <- function(x) (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) ae_swath <- function(x,hidden.layers=c(987,197,987)) { library(ggplot2) library(plotly) library(keras) x_train <- apply(x, 2, minmax) x_train <- as.matrix(x_train) model <- keras_model_sequential() model %>% layer_dense(units = hidden.layers[1], activation = "tanh", input_shape = ncol(x_train)) %>% layer_dense(units = hidden.layers[2], activation = "tanh", name = "bottleneck") %>% layer_dense(units = hidden.layers[3], activation = "tanh") %>% layer_dense(units = ncol(x_train)) summary(model) model %>% compile(loss = "mean_squared_error", optimizer = "adam") model %>% fit(x = x_train, y = x_train, epochs = 2000, verbose = 0) mse.ae2 <- evaluate(model, x_train, x_train) print(mse.ae2) intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output) intermediate_output <- predict(intermediate_layer_model, x_train) pred <- model %>% predict(x_train) (x_train-pred)^2 } swath_overlap <- function() { id2 <- data.frame(Accession=protein_matrix[-1,1]) dir <- paste(Sys.getenv("INF"),"Caprion",sep="/") source(paste(dir,"utils/olink.inc",sep="/")) tmp <- read.delim(paste(dir,"inf1.tmp",sep="/"),as.is=TRUE,col.names=c("prot","uniprot")) olink_inf1 <- read.delim(paste(dir,"olink.inf.panel.annot.tsv",sep="/"), as.is=TRUE)[c("target","target.short","uniprot","panel","hgnc_symbol")] inf1 <- merge(tmp,olink_inf1,by="uniprot") swath_chk <- merge(inf1,id2,by.x="uniprot",by.y="Accession") write.csv(swath_chk,file="swath_inf1.chk",row.names=FALSE,quote=FALSE) xlsx <- paste(dir,"Olink validation data all panels.xlsx",sep="/") tabs <- c("CVD II","CVD III","Inflammation","Neurology") olink_panel(xlsx,tabs,TRUE,92,FALSE) swath_inf1 <- merge(id2,Inflammation[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.") write.csv(swath_inf1,file="swath_inf1.csv",row.names=FALSE,quote=FALSE) swath_cvd2 <- merge(id2,CVD_II[c("Target","UniProt.No.")],by.x="Accession",by.y="UniProt.No.") write.csv(swath_cvd2,file="swath_cvd2.csv",row.names=FALSE,quote=FALSE) swath_cvd3 <- merge(id2,CVD_III[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No") write.csv(swath_cvd3,file="swath_cvd3.csv",row.names=FALSE,quote=FALSE) swath_neurology <- merge(id2,Neurology[c("Target","UniProt.No")],by.x="Accession",by.y="UniProt.No") write.csv(swath_neurology,file="swath_neurology.csv",row.names=FALSE,quote=FALSE) somalogic <- read.delim(paste(dir,"SOMALOGIC_Master_Table_160410_1129info.tsv",sep="/"),as.is=TRUE) swath_somalogic <- merge(id2,subset(somalogic,UniProt!="NA")[c("Target","UniProt")],by.x="Accession",by.y="UniProt") write.csv(unique(swath_somalogic),file="swath_somalogic.csv",row.names=FALSE,quote=FALSE) swath_caprion <- merge(id2,protein_list[c("Protein","Accession","Gene")],by="Accession") write.csv(unique(swath_caprion),file="swath_caprion.csv",row.names=FALSE,quote=FALSE) library(reshape) Inflammation <- rename(Inflammation,c(UniProt.No.="UniProt.No")) CVD_II <- rename(CVD_II,c(UniProt.No.="UniProt.No")) Olink <- rbind(Inflammation[c("Target","UniProt.No")],CVD_II[c("Target","UniProt.No")], CVD_III[c("Target","UniProt.No")],Neurology[c("Target","UniProt.No")]) Olink <- within(subset(Olink,UniProt.No!="NA"), {OlinkID=UniProt.No; OlinkTarget=Target}) SomaLogic <- within(subset(somalogic,UniProt!="NA"),{SomaLogicID=UniProt;SomaLogicTarget=Target}) swath_olink <- merge(id2,Olink[c("UniProt.No","OlinkID","OlinkTarget")],by.x="Accession",by.y="UniProt.No") swath_olink_somalogic <- merge(swath_olink,SomaLogic[c("SomaLogicTarget","UniProt","SomaLogicID")],by.x="Accession",by.y="UniProt") swath_olink_somalogic_caprion <- merge(swath_olink_somalogic,protein_list[c("Protein","Accession","Gene")],by="Accession") save(swath_olink_somalogic_caprion,file="swath-ms-overlap.rda") library(VennDiagram) plist <- list(id2[["Accession"]],protein_list[["Accession"]],Olink[["UniProt.No"]],SomaLogic[["UniProt"]]) cnames=c("SWATH-MS", "Caprion", "Olink", "SomaLogic") venn.diagram(x = plist, category.names=cnames, filename='4_way_venn_diagram.png', imagetype="png", output=TRUE) } affymetrix <- function(select) { affymetrix.id <- with(subset(swath_pheno,!is.na(swathMS_id)),Affymetrix_gwasQC_bl) write.table(affymetrix.id[!is.na(affymetrix.id)],file="affymetrix.id",row.names=FALSE,col.names=FALSE,quote=FALSE) id1_id2_0 <- read.table(paste(Caprion,"interval.samples",sep="/"),skip=2,col.names=c("ID_1","ID_2","missing")) missing <- read.table(paste(Caprion,"merged_imputation.missing",sep="/"),col.names=c("affymetrix_gwasqc_bl","missing")) id1_id2_missing <- merge(id1_id2_0[,-3],missing,by.x="ID_1",by.y="affymetrix_gwasqc_bl") eigenvec <- read.delim(paste(Caprion,"merged_imputation.eigenvec",sep="/")) covariates <- merge(swath_pheno[c("Affymetrix_gwasQC_bl","sex","age","bmi")],eigenvec[,-1], by.x="Affymetrix_gwasQC_bl",by.y="IID") id1_id2_missing_covariates <- subset(merge(id1_id2_missing,covariates,by.x="ID_1",by.y="Affymetrix_gwasQC_bl"),ID_1%in%affymetrix.id) expr <- id1_id2_missing_covariates[,-(1:3)] rownames(expr) <- id1_id2_missing_covariates[,1] write.table(t(expr),"1.covariates.txt",col.names=TRUE,quote=FALSE,sep="\t") uniprot <- names(swath_protein[,-c(1:2)]) cat(uniprot, file="swath-ms.uniprot") if (select=="SomaLogic") { sample.file <- "SomaLogic.sample" uniprot <- scan(paste(Caprion,"SomaLogic.uniprot",sep="/"),what="") SomaLogic <- subset(protein_matrix[-1,],Internal.ID%in%uniprot) names(SomaLogic) <- protein_matrix[1,] rownames(SomaLogic) <- SomaLogic[,1] d1 <- t(SomaLogic[,-1]) d1 <- data.frame(swath_id=rownames(d1),round(d1,3)) protein <- merge(swath_pheno[c("swathMS_id","Affymetrix_gwasQC_bl")],d1,by.x="swathMS_id",by.y="swath_id") p <- protein[,-1] } else { sample.file <- "swath-ms.sample" p <- swath_protein[,-1] } p <- within(p,{for(i in names(p)[-1]) assign(paste0(i,"_invn"), gap::invnormal(p[i]))}) p <- p[,-ncol(p)] id1_id2_missing_covariates <- merge(id1_id2_missing,covariates,by.x="ID_1",by.y="Affymetrix_gwasQC_bl") id1_id2_missing_covariates_phenotypes <- merge(id1_id2_missing_covariates,p, by.x="ID_1",by.y="Affymetrix_gwasQC_bl") gap::snptest_sample(id1_id2_missing_covariates_phenotypes,sample.file, C=c("age","bmi",paste0("PC",1:20)), D="sex", P=names(p)[-1]) }