es_pkgs <- c("Biobase", "arrayQualityMetrics", "dplyr", "knitr", "mclust", "pQTLtools", "rgl", "scatterplot3d")
se_pkgs <- c("BiocGenerics","GenomicRanges","IRanges","MsCoreUtils","S4Vectors","SummarizedExperiment")
pkgs <- c(es_pkgs,se_pkgs)
for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) {
if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install"))
}
invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))
1 ExpressionSet
We start with Bioconductor/Biobase’s ExpressionSet example and finish with a real application.
dataDirectory <- system.file("extdata", package="Biobase")
exprsFile <- file.path(dataDirectory, "exprsData.txt")
exprs <- as.matrix(read.table(exprsFile, header=TRUE, sep="\t", row.names=1, as.is=TRUE))
pDataFile <- file.path(dataDirectory, "pData.txt")
pData <- read.table(pDataFile, row.names=1, header=TRUE, sep="\t")
all(rownames(pData)==colnames(exprs))
metadata <- data.frame(labelDescription=c("Patient gender",
"Case/control status",
"Tumor progress on XYZ scale"),
row.names=c("gender", "type", "score"))
phenoData <- Biobase::AnnotatedDataFrame(data=pData, varMetadata=metadata)
experimentData <- Biobase::MIAME(name="Pierre Fermat",
lab="Francis Galton Lab",
contact="pfermat@lab.not.exist",
title="Smoking-Cancer Experiment",
abstract="An example ExpressionSet",
url="www.lab.not.exist",
other=list(notes="Created from text files"))
exampleSet <- pQTLtools::make_ExpressionSet(exprs,phenoData,experimentData=experimentData,
annotation="hgu95av2")
data(sample.ExpressionSet, package="Biobase")
identical(exampleSet,sample.ExpressionSet)
1.1 data.frame
The great benefit is to use the object directly as a data.frame.
lm.result <- Biobase::esApply(exampleSet,1,function(x) lm(score~gender+x))
beta.x <- unlist(lapply(lapply(lm.result,coef),"[",3))
beta.x[1]
#> AFFX-MurIL2_at.x
#> -0.0001907472
lm(score~gender+AFFX.MurIL2_at,data=exampleSet)
#>
#> Call:
#> lm(formula = score ~ gender + AFFX.MurIL2_at, data = exampleSet)
#>
#> Coefficients:
#> (Intercept) genderMale AFFX.MurIL2_at
#> 0.5531725 0.0098932 -0.0001907
1.2 Composite plots
We wish to examine the distribution of each feature via histogram, scatter and boxplot.
One could resort to esApply()
for its simplicity as before
invisible(Biobase::esApply(exampleSet[1:2],1,function(x)
{par(mfrow=c(3,1));boxplot(x);hist(x);plot(x)}
))
but it is nicer to add feature name in the title.
par(mfrow=c(1,3))
f <- Biobase::featureNames(exampleSet[1:2])
invisible(sapply(f,function(x) {
d <- t(Biobase::exprs(exampleSet[x]))
fn <- Biobase::featureNames(exampleSet[x])
hist(d,main="",xlab=fn); plot(d, ylab=fn); boxplot(d,ylab=fn)
}
)
)
where the expression set is indexed using feature name(s).
1.4 Clustering
We employ model-based clustering absed on principal compoents to see potential groupings in the data,
pca <- prcomp(na.omit(t(Biobase::exprs(exampleSet))), rank=10, scale=TRUE)
pc1pc2pc3 <- with(pca,x)[,1:3]
mc <- mclust::Mclust(pc1pc2pc3,G=3)
with(mc, {
cols <- c("blue","red", "purple")
s3d <- scatterplot3d::scatterplot3d(with(pca,x[,c(2,1,3)]),
color=cols[classification],
pch=16,
type="h",
main="Plot of the PC1, PC2 and PC3")
s3d.coords <- s3d$xyz.convert(with(pca,x[,c(2,1,3)]))
text(s3d.coords$x,
s3d.coords$y,
cex = 1.2,
col = cols[classification],
labels = row.names(pc1pc2pc3),
pos = 4)
legend("right", legend=levels(as.factor(classification)), col=cols[classification], pch=16)
rgl::open3d(width = 500, height = 500)
rgl::plot3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],size=5)
rgl::text3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],texts=row.names(pc1pc2pc3))
htmlwidgets::saveWidget(rgl::rglwidget(), file = "mcpca3d.html")
})
#> Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
#> parameter "width" cannot be set
#> Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
#> parameter "height" cannot be set
An interactive version is also available,
1.5 Data transformation
Suppose we wish use log2 for those greater than zero but set those negative values to be missing.
1.6 Limit of detection (LOD)
We generate a lod.max
~ U[0,1] variable to experiment
Biobase::fData(exampleSet)
#> data frame with 0 columns and 500 rows
Biobase::fData(exampleSet)$lod.max <- apply(Biobase::exprs(exampleSet),1,quantile,runif(nrow(exampleSet)))
lod <- pQTLtools::get.prop.below.LLOD(exampleSet)
x <- dplyr::arrange(Biobase::fData(lod),desc(pc.belowLOD.new))
knitr::kable(head(lod))
AFFX.MurIL2_at | AFFX.MurIL10_at | AFFX.MurIL4_at | AFFX.MurFAS_at | AFFX.BioB.5_at | AFFX.BioB.M_at | gender | type | score | |
---|---|---|---|---|---|---|---|---|---|
A | 192.7420 | 97.13700 | 45.81920 | 22.54450 | 96.7875 | 89.0730 | Female | Control | 0.75 |
B | 85.7533 | 126.19600 | 8.83135 | 3.60093 | 30.4380 | 25.8461 | Male | Case | 0.40 |
C | 176.7570 | 77.92160 | 33.06320 | 14.68830 | 46.1271 | 57.2033 | Male | Control | 0.73 |
D | 135.5750 | 93.37130 | 28.70720 | 12.33970 | 70.9319 | 69.9766 | Male | Case | 0.42 |
E | 64.4939 | 24.39860 | 5.94492 | 36.86630 | 56.1744 | 49.5822 | Female | Case | 0.93 |
F | 76.3569 | 85.50880 | 28.29250 | 11.25680 | 42.6756 | 26.1262 | Male | Control | 0.22 |
G | 160.5050 | 98.90860 | 30.96940 | 23.00340 | 86.5156 | 75.0083 | Male | Case | 0.96 |
H | 65.9631 | 81.69320 | 14.79230 | 16.21340 | 30.7927 | 42.3352 | Male | Case | 0.79 |
I | 56.9039 | 97.80150 | 14.23990 | 12.03750 | 19.7183 | 41.1207 | Female | Case | 0.37 |
J | 135.6080 | 90.48380 | 34.48740 | 4.54978 | 46.3520 | 91.5307 | Male | Control | 0.63 |
K | 63.4432 | 70.57330 | 20.35210 | 8.51782 | 39.1326 | 39.9136 | Male | Case | 0.26 |
L | 78.2126 | 94.54180 | 14.15540 | 27.28520 | 41.7698 | 49.8397 | Female | Control | 0.36 |
M | 83.0943 | 75.34550 | 20.62510 | 10.16160 | 80.2197 | 63.4794 | Male | Case | 0.41 |
N | 89.3372 | 68.58270 | 15.92310 | 20.24880 | 36.4903 | 24.7007 | Male | Case | 0.80 |
O | 91.0615 | 87.40500 | 20.15790 | 15.78490 | 36.4021 | 47.4641 | Female | Case | 0.10 |
P | 95.9377 | 84.45810 | 27.81390 | 14.32760 | 35.3054 | 47.3578 | Female | Control | 0.41 |
Q | 179.8450 | 87.68060 | 32.79110 | 15.94880 | 58.6239 | 58.1331 | Female | Case | 0.16 |
R | 152.4670 | 108.03200 | 33.52920 | 14.67530 | 114.0620 | 104.1220 | Male | Control | 0.72 |
S | 180.8340 | 134.26300 | 19.81720 | -7.91911 | 93.4402 | 115.8310 | Male | Case | 0.17 |
T | 85.4146 | 91.40310 | 20.41900 | 12.88750 | 22.5168 | 58.1224 | Female | Case | 0.74 |
U | 157.9890 | -8.68811 | 26.87200 | 11.91860 | 48.6462 | 73.4221 | Male | Control | 0.35 |
V | 146.8000 | 85.02120 | 31.14880 | 12.83240 | 90.2215 | 64.6066 | Female | Control | 0.77 |
W | 93.8829 | 79.29980 | 22.34200 | 11.13900 | 42.0053 | 40.3068 | Male | Control | 0.27 |
X | 103.8550 | 71.65520 | 19.01350 | 7.55564 | 57.5738 | 41.8209 | Male | Control | 0.98 |
Y | 64.4340 | 64.23690 | 12.16860 | 19.98490 | 44.8216 | 46.1087 | Female | Case | 0.94 |
Z | 175.6150 | 78.70680 | 17.37800 | 8.96849 | 61.7044 | 49.4122 | Female | Case | 0.32 |
plot(x[,2], main="Random quantile cutoff", ylab="<lod%")
The quantity has been shown to have a big impact on protein abundance and therefore pQTL detection as is shown with a real example.
rm(list=ls())
dir <- "~/rds/post_qc_data/interval/phenotype/olink_proteomics/post-qc/"
eset <- readRDS(paste0(dir,"eset.inf1.flag.out.outlier.in.rds"))
x <- pQTLtools::get.prop.below.LLOD(eset)
annot <- Biobase::fData(x)
annot$MissDataProp <- as.numeric(gsub("\\%$", "", annot$MissDataProp))
plot(annot$MissDataProp, annot$pc.belowLOD.new, xlab="% <LLOD in Original",
ylab="% <LLOD in post QC dataset", pch=19)
INF <- Sys.getenv("INF")
np <- read.table(paste(INF, "work", "INF1.merge.nosig", sep="/"), header=FALSE,
col.names = c("prot", "uniprot"))
kable(np, caption="Proteins with no pQTL")
annot$pQTL <- rep(NA, nrow(annot))
no.pQTL.ind <- which(annot$uniprot.id %in% np$uniprot)
annot$pQTL[no.pQTL.ind] <- "red"
annot$pQTL[-no.pQTL.ind] <- "blue"
annot <- annot[order(annot$pc.belowLOD.new, decreasing = TRUE),]
annot <- annot[-grep("^BDNF$", annot$ID),]
saveRDS(annot,file=file.path("~","pQTLtools","tests","annot.RDS"))
annot <- readRDS(file.path(find.package("pQTLtools"),"tests","annot.RDS")) %>%
dplyr::left_join(pQTLdata::inf1[c("prot","target.short","alt_name")],by=c("ID"="prot")) %>%
dplyr::mutate(prot=if_else(is.na(alt_name),target.short,alt_name),order=1:n()) %>%
dplyr::arrange(desc(order))
xtick <- seq(1, nrow(annot))
attach(annot)
par(mar=c(10,5,1,1))
plot(100-pc.belowLOD.new,cex=2,pch=19,col=pQTL,xaxt="n",xlab="",ylab="",cex.axis=0.8)
text(66,16,"IL-17C",offset=0,pos=2,cex=1.5,font=2,srt=0)
arrows(67,16,71,16,lwd=2)
axis(1, at=xtick, labels=prot, lwd.tick=0.5, lwd=0, las=2, hadj=1, cex.axis=0.8)
mtext("% samples above LLOD",side=2,line=2.5,cex=1.2)
mtext("Ordered protein",side=1,line=6.5,cex=1.2,font=1)
legend(x=1,y=25,c("without pQTL","with pQTL"),box.lwd=0,cex=2,col=c("red","blue"),pch=19)
ID | dichot | olink.id | uniprot.id | lod.max | MissDataProp | pc.belowLOD.new | pQTL | target.short | alt_name | prot | order |
---|---|---|---|---|---|---|---|---|---|---|---|
CSF.1 | FALSE | 196_CSF-1 | P09603 | 1.02 | 0.02 | 0.0000000 | blue | CSF-1 | NA | CSF-1 | 91 |
TNFB | FALSE | 195_TNFB | P01374 | 1.10 | 0.04 | 0.0000000 | blue | TNFB | NA | TNFB | 90 |
ADA | FALSE | 194_ADA | P00813 | 1.80 | 0.04 | 0.0000000 | blue | ADA | NA | ADA | 89 |
STAMPB | FALSE | 192_STAMPB | O95630 | 1.40 | 0.06 | 0.0000000 | red | STAMPB | NA | STAMPB | 88 |
CCL20 | FALSE | 190_CCL20 | P78556 | 1.42 | 0.02 | 0.0000000 | blue | CCL20 | NA | CCL20 | 87 |
TWEAK | FALSE | 189_TWEAK | O43508 | 1.78 | 0.02 | 0.0000000 | blue | TWEAK | NA | TWEAK | 86 |
TNFRSF9 | FALSE | 187_TNFRSF9 | Q07011 | 1.86 | 0.02 | 0.0000000 | blue | TNFRSF9 | NA | TNFRSF9 | 85 |
CX3CL1 | FALSE | 186_CX3CL1 | P78423 | 1.76 | 0.02 | 0.0000000 | blue | CX3CL1 | NA | CX3CL1 | 84 |
CCL25 | FALSE | 185_CCL25 | O15444 | 1.32 | 0.02 | 0.0000000 | blue | CCL25 | NA | CCL25 | 83 |
CASP.8 | FALSE | 184_CASP-8 | Q14790 | 1.38 | 0.04 | 0.0000000 | blue | CASP-8 | NA | CASP-8 | 82 |
MCP.2 | FALSE | 183_MCP-2 | P80075 | 2.10 | 0.02 | 0.0000000 | blue | MCP-2 | CCL8 | CCL8 | 81 |
FGF.19 | FALSE | 179_FGF-19 | O95750 | 1.25 | 0.02 | 0.0000000 | blue | FGF-19 | NA | FGF-19 | 80 |
CD40 | FALSE | 176_CD40 | P25942 | 1.91 | 0.04 | 0.0000000 | blue | CD40 | NA | CD40 | 79 |
DNER | FALSE | 174_DNER | Q8NFT8 | 1.63 | 0.02 | 0.0000000 | blue | DNER | NA | DNER | 78 |
4E.BP1 | FALSE | 170_4E-BP1 | Q13541 | 1.04 | 0.10 | 0.0000000 | blue | 4EBP1 | NA | 4EBP1 | 77 |
CXCL10 | FALSE | 169_CXCL10 | P02778 | 2.03 | 0.02 | 0.0000000 | blue | CXCL10 | NA | CXCL10 | 76 |
CXCL6 | FALSE | 168_CXCL6 | P80162 | 1.80 | 0.02 | 0.0000000 | blue | CXCL6 | CXCL6 | CXCL6 | 75 |
Flt3L | FALSE | 167_Flt3L | P49771 | 2.12 | 0.02 | 0.0000000 | blue | FIt3L | NA | FIt3L | 74 |
CD5 | FALSE | 165_CD5 | P06127 | 1.42 | 0.02 | 0.0000000 | blue | CD5 | NA | CD5 | 73 |
CCL23 | FALSE | 164_CCL23 | P55773 | 2.10 | 0.02 | 0.0000000 | blue | CCL23 | NA | CCL23 | 72 |
MMP.10 | FALSE | 161_MMP-10 | P09238 | 1.56 | 0.02 | 0.0000000 | blue | MMP-10 | NA | MMP-10 | 71 |
IL.12B | FALSE | 157_IL-12B | P29460 | 0.85 | 0.02 | 0.0000000 | blue | IL-12B | NA | IL-12B | 70 |
HGF | FALSE | 156_HGF | P14210 | 1.77 | 0.02 | 0.0000000 | blue | HGF | NA | HGF | 69 |
TRANCE | FALSE | 155_TRANCE | O14788 | 1.51 | 0.04 | 0.0000000 | blue | TRANCE | NA | TRANCE | 68 |
CXCL5 | FALSE | 154_CXCL5 | P42830 | 1.92 | 0.02 | 0.0000000 | blue | CXCL5 | NA | CXCL5 | 67 |
PD.L1 | FALSE | 152_PD-L1 | Q9NZQ7 | 2.14 | 0.04 | 0.0000000 | blue | PD-L1 | NA | PD-L1 | 66 |
IL.18R1 | FALSE | 151_IL-18R1 | Q13478 | 1.21 | 0.02 | 0.0000000 | blue | IL-18R1 | NA | IL-18R1 | 65 |
IL.10RB | FALSE | 149_IL-10RB | Q08334 | 1.72 | 0.02 | 0.0000000 | blue | IL10RB | NA | IL10RB | 64 |
CCL19 | FALSE | 145_CCL19 | Q99731 | 2.13 | 0.02 | 0.0000000 | blue | CCL19 | NA | CCL19 | 63 |
FGF.21 | FALSE | 144_FGF-21 | Q9NSA1 | 1.14 | 0.02 | 0.0000000 | blue | FGF-21 | NA | FGF-21 | 62 |
LIF.R | FALSE | 143_LIF-R | P42702 | 2.08 | 0.04 | 0.0000000 | blue | LIF-R | NA | LIF-R | 61 |
FGF.23 | FALSE | 139_FGF-23 | Q9GZV9 | 0.66 | 0.04 | 0.0000000 | blue | FGF-23 | NA | FGF-23 | 60 |
TNFSF14 | FALSE | 138_TNFSF14 | O43557 | 1.81 | 0.02 | 0.0000000 | blue | TNFSF14 | NA | TNFSF14 | 59 |
CCL11 | FALSE | 137_CCL11 | P51671 | 2.09 | 0.02 | 0.0000000 | blue | CCL11 | CCL11 | CCL11 | 58 |
MCP.4 | FALSE | 136_MCP-4 | Q99616 | 0.56 | 0.04 | 0.0000000 | blue | MCP-4 | CCL13 | CCL13 | 57 |
TGF.alpha | FALSE | 135_TGF-alpha | P01135 | 0.07 | 0.02 | 0.0000000 | blue | TGF-alpha | NA | TGF-alpha | 56 |
IL.18 | FALSE | 133_IL-18 | Q14116 | 2.79 | 0.02 | 0.0000000 | blue | IL-18 | NA | IL-18 | 55 |
SCF | FALSE | 132_SCF | P21583 | 2.30 | 0.02 | 0.0000000 | blue | SCF | NA | SCF | 54 |
CD6 | FALSE | 131_CD6 | Q8WWJ7 | 1.78 | 0.04 | 0.0000000 | blue | CD6 | NA | CD6 | 53 |
CCL4 | FALSE | 130_CCL4 | P13236 | 2.15 | 0.02 | 0.0000000 | blue | CCL4 | NA | CCL4 | 52 |
CXCL1 | FALSE | 128_CXCL1 | P09341 | 3.45 | 0.02 | 0.0000000 | blue | CXCL1 | NA | CXCL1 | 51 |
OSM | FALSE | 126_OSM | P13725 | 1.32 | 0.06 | 0.0000000 | blue | OSM | NA | OSM | 50 |
CST5 | FALSE | 123_CST5 | P28325 | 4.34 | 0.04 | 0.0000000 | blue | CST5 | NA | CST5 | 49 |
CXCL9 | FALSE | 122_CXCL9 | Q07325 | 1.04 | 0.02 | 0.0000000 | blue | CXCL9 | NA | CXCL9 | 48 |
TRAIL | FALSE | 120_TRAIL | P50591 | 1.22 | 0.02 | 0.0000000 | blue | TRAIL | NA | TRAIL | 47 |
CXCL11 | FALSE | 117_CXCL11 | O14625 | 1.40 | 0.02 | 0.0000000 | blue | CXCL11 | NA | CXCL11 | 46 |
MCP.1 | FALSE | 115_MCP-1 | P13500 | 1.90 | 0.02 | 0.0000000 | blue | MCP-1 | CCL2 | CCL2 | 45 |
uPA | FALSE | 112_uPA | P00749 | 1.75 | 0.02 | 0.0000000 | blue | uPA | NA | uPA | 44 |
LAP.TGF.beta.1 | FALSE | 111_LAP TGF-beta-1 | P01137 | 0.86 | 0.02 | 0.0000000 | blue | LAP TGF-beta-1 | NA | LAP TGF-beta-1 | 43 |
OPG | FALSE | 110_OPG | O00300 | 1.81 | 0.02 | 0.0000000 | blue | OPG | NA | OPG | 42 |
CD244 | FALSE | 108_CD244 | Q9BZW8 | 1.09 | 0.02 | 0.0000000 | blue | CD244 | NA | CD244 | 41 |
CDCP1 | FALSE | 107_CDCP1 | Q9H5V8 | 0.21 | 0.04 | 0.0000000 | blue | CDCP1 | NA | CDCP1 | 40 |
VEGF.A | FALSE | 102_VEGF-A | P15692 | 3.15 | 0.02 | 0.0000000 | blue | VEGF_A | NA | VEGF_A | 39 |
MIP.1.alpha | FALSE | 166_MIP-1 alpha | P10147 | 1.92 | 0.06 | 0.0203998 | blue | MIP-1 alpha | NA | MIP-1 alpha | 38 |
MMP.1 | FALSE | 142_MMP-1 | P03956 | 1.89 | 0.06 | 0.0407997 | blue | MMP-1 | NA | MMP-1 | 37 |
SIRT2 | FALSE | 172_SIRT2 | Q8IXJ6 | 1.52 | 1.00 | 0.0815993 | blue | SIRT2 | NA | SIRT2 | 36 |
IL.8 | FALSE | 101_IL-8 | P10145 | 3.17 | 0.25 | 0.2447980 | blue | IL-8 | NA | IL-8 | 35 |
IL.7 | FALSE | 109_IL-7 | P13232 | 1.39 | 1.43 | 0.7547940 | blue | IL-7 | NA | IL-7 | 34 |
SLAMF1 | FALSE | 134_SLAMF1 | Q13291 | 1.55 | 2.13 | 1.5299878 | blue | SLAMF1 | NA | SLAMF1 | 33 |
EN.RAGE | FALSE | 175_EN-RAGE | P80511 | 1.57 | 3.28 | 3.5291718 | blue | EN-RAGE | NA | EN-RAGE | 32 |
Beta.NGF | FALSE | 153_Beta-NGF | P01138 | 1.23 | 3.96 | 4.2227662 | blue | Beta-NGF | NA | Beta-NGF | 31 |
CCL28 | FALSE | 173_CCL28 | Q9NRJ3 | 0.92 | 5.49 | 5.2835577 | red | CCL28 | NA | CCL28 | 30 |
IL.10 | FALSE | 162_IL-10 | P22301 | 1.50 | 7.71 | 7.0583435 | blue | IL-10 | NA | IL-10 | 29 |
AXIN1 | FALSE | 118_AXIN1 | O15169 | 1.30 | 13.04 | 10.8731130 | red | AXIN1 | NA | AXIN1 | 28 |
NT.3 | FALSE | 188_NT-3 | P20783 | 0.85 | 12.77 | 13.2394941 | blue | NT-3 | NA | NT-3 | 27 |
FGF.5 | FALSE | 141_FGF-5 | Q8NF90 | 1.00 | 31.71 | 30.3957568 | blue | FGF-5 | NA | FGF-5 | 26 |
ST1A1 | FALSE | 191_ST1A1 | P50225 | 1.34 | 32.68 | 31.2525500 | blue | ST1A1 | NA | ST1A1 | 25 |
GDNF | FALSE | 106_GDNF | P39905 | 1.13 | 43.26 | 42.4520604 | blue | hGDNF | NA | hGDNF | 24 |
IL.10RA | FALSE | 140_IL-10RA | Q13651 | 1.14 | 53.30 | 53.9779682 | red | IL-10RA | NA | IL-10RA | 23 |
IL.6 | FALSE | 113_IL-6 | P05231 | 1.97 | 69.06 | 68.6250510 | blue | IL-6 | NA | IL-6 | 22 |
IL.5 | FALSE | 193_IL-5 | P05113 | 1.55 | 78.21 | 77.8457772 | red | IL-5 | NA | IL-5 | 21 |
MCP.3 | FALSE | 105_MCP-3 | P80098 | 1.31 | 79.30 | 79.0697674 | blue | MCP-3 | CCL7 | CCL7 | 20 |
IL.17C | FALSE | 114_IL-17C | Q9P0M4 | 1.28 | 83.07 | 83.3741330 | blue | IL-17C | NA | IL-17C | 19 |
IL.17A | FALSE | 116_IL-17A | Q16552 | 0.97 | 84.73 | 84.7001224 | red | IL-17A | NA | IL-17A | 18 |
IL.4 | FALSE | 180_IL-4 | P05112 | 1.11 | 93.91 | 94.4104447 | red | IL-4 | NA | IL-4 | 17 |
LIF | FALSE | 181_LIF | P15018 | 1.45 | 94.90 | 94.8184415 | red | LIF | NA | LIF | 16 |
TNF | FALSE | 163_TNF | P01375 | 1.19 | 95.22 | 95.1244390 | red | TNF | NA | TNF | 15 |
IL.20RA | FALSE | 121_IL-20RA | Q9UHF4 | 0.79 | 95.16 | 95.7772338 | red | IL-20RA | NA | IL-20RA | 14 |
ARTN | FALSE | 160_ARTN | Q5T4W7 | 0.73 | 95.84 | 95.9404325 | red | ARTN | NA | ARTN | 13 |
IL.15RA | FALSE | 148_IL-15RA | Q13261 | 1.18 | 96.10 | 96.1648307 | blue | IL-15RA | NA | IL-15RA | 12 |
IL.13 | FALSE | 159_IL-13 | P35225 | 1.71 | 96.51 | 96.5524276 | red | IL-13 | NA | IL-13 | 11 |
IL.20 | FALSE | 171_IL-20 | Q9NYY1 | 1.30 | 96.72 | 96.7156263 | red | IL-20 | NA | IL-20 | 10 |
IL.24 | FALSE | 158_IL-24 | Q13007 | 1.95 | 97.13 | 97.0828233 | red | IL-24 | NA | IL-24 | 9 |
IL.2RB | FALSE | 124_IL-2RB | P14784 | 1.47 | 97.11 | 97.1032232 | red | IL-2RB | NA | IL-2RB | 8 |
NRTN | FALSE | 182_NRTN | Q99748 | 1.39 | 97.11 | 97.1236230 | red | NRTN | NA | NRTN | 7 |
IL.33 | FALSE | 177_IL-33 | O95760 | 1.17 | 98.54 | 98.5516116 | red | IL-33 | NA | IL-33 | 6 |
TSLP | FALSE | 129_TSLP | Q969D9 | 1.80 | 99.49 | 99.4900041 | red | TSLP | NA | TSLP | 5 |
IFN.gamma | FALSE | 178_IFN-gamma | P01579 | 1.40 | 99.53 | 99.5308038 | red | IFN-gamma | NA | IFN-gamma | 4 |
IL.22.RA1 | FALSE | 150_IL-22 RA1 | Q8N6P7 | 1.79 | 99.61 | 99.6124031 | red | IL-22RA1 | NA | IL-22RA1 | 3 |
IL.2 | FALSE | 127_IL-2 | P60568 | 0.93 | 99.86 | 99.8776010 | red | IL-2 | NA | IL-2 | 2 |
IL.1.alpha | FALSE | 125_IL-1 alpha | P01583 | 4.76 | 99.88 | 99.8776010 | blue | IL-1 alpha | NA | IL-1 alpha | 1 |
1.7 maEndtoEnd
Web: https://bioconductor.org/packages/release/workflows/html/maEndToEnd.html.
Examples can be found on PCA, heatmap, normalisation, linear models, and enrichment analysis from this Bioconductor package.
2 SummarizedExperiment
This is a more modern construct. Based on the documentation example, ranged summarized experiment (rse) and imputation are shown below.
set.seed(123)
nrows <- 20
ncols <- 4
counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
missing_indices <- sample(length(counts), size = 5, replace = FALSE)
counts[missing_indices] <- NA
rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(1, 3) * nrows / 4),
IRanges::IRanges(floor(runif(nrows, 1e5, 1e6)), width=ncols),
strand=sample(c("+", "-"), nrows, TRUE),
feature_id=sprintf("ID%03d", 1:nrows))
colData <- S4Vectors::DataFrame(Treatment=rep(c("ChIP", "Input"), ncols/2),
row.names=LETTERS[1:ncols])
rse <- SummarizedExperiment(assays=S4Vectors::SimpleList(counts=counts),
rowRanges=rowRanges, colData=colData)
assay(rse)
#> A B C D
#> [1,] 2876.4876 8895.5036 1428.8574 6651.486831
#> [2,] 7883.2630 6928.3413 4146.0488 949.311769
#> [3,] 4090.3602 6405.4276 4137.8295 3840.312408
#> [4,] 8830.2910 9942.7035 3689.0857 2744.562062
#> [5,] 9404.7324 6557.4023 1525.2950 8146.585749
#> [6,] NA 7085.5962 1388.9218 4485.714898
#> [7,] 5281.5268 5441.1162 2331.1080 8100.833466
#> [8,] 8924.2980 5941.8261 4660.1585 8124.082706
#> [9,] 5514.7987 2892.3082 2660.4604 7943.628869
#> [10,] 4566.6907 1471.9894 NA 4398.877044
#> [11,] 9568.3766 NA 459.2658 7544.997111
#> [12,] 4533.8882 9023.0882 4422.5585 NA
#> [13,] 6776.0288 6907.3621 7989.4495 7102.113831
#> [14,] 5726.7614 7954.8787 1219.8707 7.247108
#> [15,] 1030.1439 247.1122 5609.9189 4753.690424
#> [16,] 8998.3499 4778.4819 2066.1074 2201.968733
#> [17,] 2461.6313 7584.8369 1276.1890 3798.785561
#> [18,] 421.5533 2164.8630 7533.3253 6128.097262
#> [19,] 3279.8793 NA 8950.5585 3518.627295
#> [20,] 9545.0820 2317.0262 3745.2533 1112.243108
imputed <- MsCoreUtils::impute_knn(as.matrix(SummarizedExperiment::assay(rse)),2)
imputed_counts <- MsCoreUtils::impute_RF(as.matrix(assay(rse)),2)
imputed-imputed_counts
#> A B C D
#> [1,] 0.000 0.0000 0.000 0.000
#> [2,] 0.000 0.0000 0.000 0.000
#> [3,] 0.000 0.0000 0.000 0.000
#> [4,] 0.000 0.0000 0.000 0.000
#> [5,] 0.000 0.0000 0.000 0.000
#> [6,] 2559.339 0.0000 0.000 0.000
#> [7,] 0.000 0.0000 0.000 0.000
#> [8,] 0.000 0.0000 0.000 0.000
#> [9,] 0.000 0.0000 0.000 0.000
#> [10,] 0.000 0.0000 1791.909 0.000
#> [11,] 0.000 -709.2786 0.000 0.000
#> [12,] 0.000 0.0000 0.000 2467.614
#> [13,] 0.000 0.0000 0.000 0.000
#> [14,] 0.000 0.0000 0.000 0.000
#> [15,] 0.000 0.0000 0.000 0.000
#> [16,] 0.000 0.0000 0.000 0.000
#> [17,] 0.000 0.0000 0.000 0.000
#> [18,] 0.000 0.0000 0.000 0.000
#> [19,] 0.000 1298.3456 0.000 0.000
#> [20,] 0.000 0.0000 0.000 0.000
SummarizedExperiment::assays(rse) <- S4Vectors::SimpleList(counts=imputed_counts)
SummarizedExperiment::assay(rse)
#> A B C D
#> [1,] 2876.4876 8895.5036 1428.8574 6651.486831
#> [2,] 7883.2630 6928.3413 4146.0488 949.311769
#> [3,] 4090.3602 6405.4276 4137.8295 3840.312408
#> [4,] 8830.2910 9942.7035 3689.0857 2744.562062
#> [5,] 9404.7324 6557.4023 1525.2950 8146.585749
#> [6,] 4439.2947 7085.5962 1388.9218 4485.714898
#> [7,] 5281.5268 5441.1162 2331.1080 8100.833466
#> [8,] 8924.2980 5941.8261 4660.1585 8124.082706
#> [9,] 5514.7987 2892.3082 2660.4604 7943.628869
#> [10,] 4566.6907 1471.9894 4812.3872 4398.877044
#> [11,] 9568.3766 5545.7565 459.2658 7544.997111
#> [12,] 4533.8882 9023.0882 4422.5585 4611.837102
#> [13,] 6776.0288 6907.3621 7989.4495 7102.113831
#> [14,] 5726.7614 7954.8787 1219.8707 7.247108
#> [15,] 1030.1439 247.1122 5609.9189 4753.690424
#> [16,] 8998.3499 4778.4819 2066.1074 2201.968733
#> [17,] 2461.6313 7584.8369 1276.1890 3798.785561
#> [18,] 421.5533 2164.8630 7533.3253 6128.097262
#> [19,] 3279.8793 4566.2289 8950.5585 3518.627295
#> [20,] 9545.0820 2317.0262 3745.2533 1112.243108
SummarizedExperiment::assays(rse) <- S4Vectors::endoapply(SummarizedExperiment::assays(rse), asinh)
SummarizedExperiment::assay(rse)
#> A B C D
#> [1,] 8.657472 9.786448 7.957778 9.495743
#> [2,] 9.665644 9.536523 9.023058 7.548885
#> [3,] 9.009536 9.458048 9.021074 8.946456
#> [4,] 9.779090 9.897741 8.906281 8.610524
#> [5,] 9.842115 9.481497 8.023090 9.698501
#> [6,] 9.091398 9.558966 7.929430 9.101800
#> [7,] 9.265118 9.294887 8.447246 9.692869
#> [8,] 9.789680 9.382919 9.139952 9.695735
#> [9,] 9.308338 8.662957 8.579402 9.673273
#> [10,] 9.119691 7.987517 9.172096 9.082252
#> [11,] 9.859366 9.313936 6.822778 9.621787
#> [12,] 9.112482 9.800689 9.087621 9.129529
#> [13,] 9.514294 9.533490 9.679024 9.561295
#> [14,] 9.346053 9.674688 7.799647 2.678476
#> [15,] 7.630601 6.202994 9.325439 9.159824
#> [16,] 9.797944 9.165025 8.326569 8.390254
#> [17,] 8.501727 9.627054 7.844781 8.935584
#> [18,] 6.737095 8.373260 9.620239 9.413787
#> [19,] 8.788709 9.119590 9.792618 8.858973
#> [20,] 9.856929 8.441187 8.921392 7.707281
SummarizedExperiment::rowRanges(rse)
#> GRanges object with 20 ranges and 1 metadata column:
#> seqnames ranges strand | feature_id
#> <Rle> <IRanges> <Rle> | <character>
#> [1] chr1 409164-409167 - | ID001
#> [2] chr1 691082-691085 + | ID002
#> [3] chr1 388335-388338 + | ID003
#> [4] chr1 268922-268925 - | ID004
#> [5] chr1 804064-804067 - | ID005
#> ... ... ... ... . ...
#> [16] chr2 647861-647864 + | ID016
#> [17] chr2 469620-469623 + | ID017
#> [18] chr2 232385-232388 + | ID018
#> [19] chr2 941769-941772 - | ID019
#> [20] chr2 371106-371109 + | ID020
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
SummarizedExperiment::rowData(rse)
#> DataFrame with 20 rows and 1 column
#> feature_id
#> <character>
#> 1 ID001
#> 2 ID002
#> 3 ID003
#> 4 ID004
#> 5 ID005
#> ... ...
#> 16 ID016
#> 17 ID017
#> 18 ID018
#> 19 ID019
#> 20 ID020
SummarizedExperiment::colData(rse)
#> DataFrame with 4 rows and 1 column
#> Treatment
#> <character>
#> A ChIP
#> B Input
#> C ChIP
#> D Input
rse[, rse$Treatment == "ChIP"]
#> class: RangedSummarizedExperiment
#> dim: 20 2
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(2): A C
#> colData names(1): Treatment
# the same ranges but different samples:
rse1 <- rse
rse2 <- rse1[, 1:3]
colnames(rse2) <- letters[1:ncol(rse2)]
cmb1 <- cbind(rse1, rse2)
dim(cmb1)
#> [1] 20 7
dimnames(cmb1)
#> [[1]]
#> NULL
#>
#> [[2]]
#> [1] "A" "B" "C" "D" "a" "b" "c"
# the same samples but different ranges:
rse1 <- rse
rse2 <- rse1[1:nrows, ]
rownames(rse2) <- letters[1:nrow(rse2)]
cmb2 <- rbind(rse1, rse2)
dim(cmb2)
#> [1] 40 4
dimnames(cmb2)
#> [[1]]
#> [1] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "a" "b" "c" "d" "e" "f" "g" "h" "i"
#> [30] "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t"
#>
#> [[2]]
#> [1] "A" "B" "C" "D"
se0 <- as(rse, "SummarizedExperiment")
se0
#> class: SummarizedExperiment
#> dim: 20 4
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(4): A B C D
#> colData names(1): Treatment
as(se0, "RangedSummarizedExperiment")
#> class: RangedSummarizedExperiment
#> dim: 20 4
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(4): A B C D
#> colData names(1): Treatment
# rowRanges(a SummarizedExperiment object) --> RangedSummarizedExperiment object:
se <- se0
rowRanges(se) <- rowRanges
se
#> class: RangedSummarizedExperiment
#> dim: 20 4
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(4): A B C D
#> colData names(1): Treatment
stopifnot(identical(SummarizedExperiment::assays(se0), SummarizedExperiment::assays(rse)))
stopifnot(identical(dim(se0), dim(rse)))
stopifnot(identical(dimnames(se0), dimnames(rse)))
stopifnot(identical(SummarizedExperiment::rowData(se0), SummarizedExperiment::rowData(rse)))
stopifnot(identical(SummarizedExperiment::colData(se0), SummarizedExperiment::colData(rse)))