Skip to contents
pkgs <- c("Biobase", "arrayQualityMetrics", "knitr", "pQTLtools")
for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) {
    if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install"))
}
invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))

We illustrate Bioconductor/Biobase’s ExpressionSet with its example and finish with a real example. See also the reference section.

dataDirectory <- system.file("extdata", package="Biobase")
exprsFile <- file.path(dataDirectory, "exprsData.txt")
exprs <- as.matrix(read.table(exprsFile, header=TRUE, sep="\t", row.names=1, as.is=TRUE))
pDataFile <- file.path(dataDirectory, "pData.txt")
pData <- read.table(pDataFile, row.names=1, header=TRUE, sep="\t")
all(rownames(pData)==colnames(exprs))
metadata <- data.frame(labelDescription=
                       c("Patient gender",
                         "Case/control status",
                         "Tumor progress on XYZ scale"),
                       row.names=c("gender", "type", "score"))
phenoData <- new("AnnotatedDataFrame", data=pData, varMetadata=metadata)
experimentData <- new("MIAME",
  name="Pierre Fermat",
  lab="Francis Galton Lab",
  contact="pfermat@lab.not.exist",
  title="Smoking-Cancer Experiment",
  abstract="An example ExpressionSet",
  url="www.lab.not.exist",
  other=list(notes="Created from text files"))
exampleSet <- make_ExpressionSet(exprs,phenoData,experimentData=experimentData,
                                 annotation="hgu95av2")
data(sample.ExpressionSet)
identical(exampleSet,sample.ExpressionSet)

data.frame

The great benefit is to use the object directly as a data.frame.

lm.result <- esApply(exampleSet,1,function(x) lm(score~gender+x))
beta.x <- unlist(lapply(lapply(lm.result,coef),"[",3))
beta.x[1]
#> AFFX-MurIL2_at.x 
#>    -0.0001907472
lm(score~gender+AFFX.MurIL2_at,data=exampleSet)
#> 
#> Call:
#> lm(formula = score ~ gender + AFFX.MurIL2_at, data = exampleSet)
#> 
#> Coefficients:
#>    (Intercept)      genderMale  AFFX.MurIL2_at  
#>      0.5531725       0.0098932      -0.0001907

Composite plots

We wish to examine the distribution of each feature via histogram, scatter and boxplot. One could resort to esApply for its simplicity as before

invisible(esApply(exampleSet[1:2],1,function(x){par(mfrow=c(3,1));boxplot(x);hist(x);plot(x)}))

but it is nicer to add feature name in the title.

f <- featureNames(exampleSet[1:2])
invisible(sapply(f,function(x) {
                     d <- t(exprs(exampleSet[x]))
                     fn <- featureNames(exampleSet[x])
                     par(mfrow=c(3,1))
                     hist(d,main="",xlab=fn); plot(d, ylab=fn); boxplot(d,ylab=fn)
                   }
          )
)

where the expression set is indexed using feature name(s).

Outlier detections

This illustrates one mechanism,

list_outliers <- function(es, method="upperquartile") outliers(exprs(es),method=method)
for (method in c("KS","sum","upperquartile"))
{
  ZWK_outliers <- list_outliers(protein_ZWK,method=method)
  print(ZWK_outliers@statistic[ZWK_outliers@which])
}

Data transformation

Suppose we wish use log2 for those greater than zero but set those negative values to be missing.

log2.na <- function(x) log2(ifelse(x>0, x, NA))
exprs(exampleSet) <- log2.na(exprs(exampleSet))

Limit of detection (LOD)

We generate a lod.max ~ U[0,1] variable to experiment

fData(exampleSet)
#> data frame with 0 columns and 500 rows
fData(exampleSet)$lod.max <- apply(exprs(exampleSet),1,quantile,runif(nrow(exampleSet)))
lod <- get.prop.below.LLOD(exampleSet)
x <- dplyr::arrange(fData(lod),desc(pc.belowLOD.new))
kable(head(lod))
AFFX.MurIL2_at AFFX.MurIL10_at AFFX.MurIL4_at AFFX.MurFAS_at AFFX.BioB.5_at AFFX.BioB.M_at gender type score
A 192.7420 97.13700 45.81920 22.54450 96.7875 89.0730 Female Control 0.75
B 85.7533 126.19600 8.83135 3.60093 30.4380 25.8461 Male Case 0.40
C 176.7570 77.92160 33.06320 14.68830 46.1271 57.2033 Male Control 0.73
D 135.5750 93.37130 28.70720 12.33970 70.9319 69.9766 Male Case 0.42
E 64.4939 24.39860 5.94492 36.86630 56.1744 49.5822 Female Case 0.93
F 76.3569 85.50880 28.29250 11.25680 42.6756 26.1262 Male Control 0.22
G 160.5050 98.90860 30.96940 23.00340 86.5156 75.0083 Male Case 0.96
H 65.9631 81.69320 14.79230 16.21340 30.7927 42.3352 Male Case 0.79
I 56.9039 97.80150 14.23990 12.03750 19.7183 41.1207 Female Case 0.37
J 135.6080 90.48380 34.48740 4.54978 46.3520 91.5307 Male Control 0.63
K 63.4432 70.57330 20.35210 8.51782 39.1326 39.9136 Male Case 0.26
L 78.2126 94.54180 14.15540 27.28520 41.7698 49.8397 Female Control 0.36
M 83.0943 75.34550 20.62510 10.16160 80.2197 63.4794 Male Case 0.41
N 89.3372 68.58270 15.92310 20.24880 36.4903 24.7007 Male Case 0.80
O 91.0615 87.40500 20.15790 15.78490 36.4021 47.4641 Female Case 0.10
P 95.9377 84.45810 27.81390 14.32760 35.3054 47.3578 Female Control 0.41
Q 179.8450 87.68060 32.79110 15.94880 58.6239 58.1331 Female Case 0.16
R 152.4670 108.03200 33.52920 14.67530 114.0620 104.1220 Male Control 0.72
S 180.8340 134.26300 19.81720 -7.91911 93.4402 115.8310 Male Case 0.17
T 85.4146 91.40310 20.41900 12.88750 22.5168 58.1224 Female Case 0.74
U 157.9890 -8.68811 26.87200 11.91860 48.6462 73.4221 Male Control 0.35
V 146.8000 85.02120 31.14880 12.83240 90.2215 64.6066 Female Control 0.77
W 93.8829 79.29980 22.34200 11.13900 42.0053 40.3068 Male Control 0.27
X 103.8550 71.65520 19.01350 7.55564 57.5738 41.8209 Male Control 0.98
Y 64.4340 64.23690 12.16860 19.98490 44.8216 46.1087 Female Case 0.94
Z 175.6150 78.70680 17.37800 8.96849 61.7044 49.4122 Female Case 0.32
plot(x[,2], main="Random quantile cut off", ylab="<lod%")

The quantity has been shown to have a big impact on protein abundance and therefore pQTL detection as is shown with a real example.

rm(list=ls())
dir <- "~/rds/post_qc_data/interval/phenotype/olink_proteomics/post-qc/"
eset <- readRDS(paste0(dir,"eset.inf1.flag.out.outlier.in.rds"))
x <- get.prop.below.LLOD(eset)
annot <- fData(x)
annot$MissDataProp <- as.numeric(gsub("\\%$", "", annot$MissDataProp))
plot(annot$MissDataProp, annot$pc.belowLOD.new, xlab="% <LLOD in Original",
     ylab="% <LLOD in post QC dataset", pch=19)
INF <- Sys.getenv("INF")
np <- read.table(paste(INF, "work", "INF1.merge.nosig", sep="/"), header=FALSE,
                 col.names = c("prot", "uniprot"))
kable(np, caption="Proteins with no pQTL")
annot$pQTL <- rep(NA, nrow(annot))
no.pQTL.ind <- which(annot$uniprot.id %in% np$uniprot)
annot$pQTL[no.pQTL.ind] <- "red"
annot$pQTL[-no.pQTL.ind] <- "blue"
annot <- annot[order(annot$pc.belowLOD.new, decreasing = TRUE),]
annot <- annot[-grep("^BDNF$", annot$ID),]
saveRDS(annot,file=file.path("~","pQTLtools","tests","annot.RDS"))
par(mar=c(5,4,1,1))
annot <- readRDS(file.path("~","pQTLtools","tests","annot.RDS"))
plot(annot$pc.belowLOD.new, col=annot$pQTL, las=1,
     ylab = "% samples with very low abundance per protein", xaxt= 'n',
     xlab = "ordered proteins", cex=0.8, pch=19)
axis(side=1, at = seq(from=0, to=nrow(annot), by=20))
legend("topright", legend= c("no pQTL", "pQTL"), col= c("red","blue"), pch = 19)

options(width=120)
kable(annot,caption="Summary statistics",row.names=FALSE)
Summary statistics
ID dichot olink.id uniprot.id lod.max MissDataProp pc.belowLOD.new pQTL
IL.1.alpha FALSE 125_IL-1 alpha P01583 4.76 99.88 99.8776010 blue
IL.2 FALSE 127_IL-2 P60568 0.93 99.86 99.8776010 red
IL.22.RA1 FALSE 150_IL-22 RA1 Q8N6P7 1.79 99.61 99.6124031 red
IFN.gamma FALSE 178_IFN-gamma P01579 1.40 99.53 99.5308038 red
TSLP FALSE 129_TSLP Q969D9 1.80 99.49 99.4900041 red
IL.33 FALSE 177_IL-33 O95760 1.17 98.54 98.5516116 red
NRTN FALSE 182_NRTN Q99748 1.39 97.11 97.1236230 red
IL.2RB FALSE 124_IL-2RB P14784 1.47 97.11 97.1032232 red
IL.24 FALSE 158_IL-24 Q13007 1.95 97.13 97.0828233 red
IL.20 FALSE 171_IL-20 Q9NYY1 1.30 96.72 96.7156263 red
IL.13 FALSE 159_IL-13 P35225 1.71 96.51 96.5524276 red
IL.15RA FALSE 148_IL-15RA Q13261 1.18 96.10 96.1648307 blue
ARTN FALSE 160_ARTN Q5T4W7 0.73 95.84 95.9404325 red
IL.20RA FALSE 121_IL-20RA Q9UHF4 0.79 95.16 95.7772338 red
TNF FALSE 163_TNF P01375 1.19 95.22 95.1244390 red
LIF FALSE 181_LIF P15018 1.45 94.90 94.8184415 red
IL.4 FALSE 180_IL-4 P05112 1.11 93.91 94.4104447 red
IL.17A FALSE 116_IL-17A Q16552 0.97 84.73 84.7001224 red
IL.17C FALSE 114_IL-17C Q9P0M4 1.28 83.07 83.3741330 blue
MCP.3 FALSE 105_MCP-3 P80098 1.31 79.30 79.0697674 blue
IL.5 FALSE 193_IL-5 P05113 1.55 78.21 77.8457772 red
IL.6 FALSE 113_IL-6 P05231 1.97 69.06 68.6250510 blue
IL.10RA FALSE 140_IL-10RA Q13651 1.14 53.30 53.9779682 red
GDNF FALSE 106_GDNF P39905 1.13 43.26 42.4520604 blue
ST1A1 FALSE 191_ST1A1 P50225 1.34 32.68 31.2525500 blue
FGF.5 FALSE 141_FGF-5 Q8NF90 1.00 31.71 30.3957568 blue
NT.3 FALSE 188_NT-3 P20783 0.85 12.77 13.2394941 blue
AXIN1 FALSE 118_AXIN1 O15169 1.30 13.04 10.8731130 red
IL.10 FALSE 162_IL-10 P22301 1.50 7.71 7.0583435 blue
CCL28 FALSE 173_CCL28 Q9NRJ3 0.92 5.49 5.2835577 red
Beta.NGF FALSE 153_Beta-NGF P01138 1.23 3.96 4.2227662 blue
EN.RAGE FALSE 175_EN-RAGE P80511 1.57 3.28 3.5291718 blue
SLAMF1 FALSE 134_SLAMF1 Q13291 1.55 2.13 1.5299878 blue
IL.7 FALSE 109_IL-7 P13232 1.39 1.43 0.7547940 blue
IL.8 FALSE 101_IL-8 P10145 3.17 0.25 0.2447980 blue
SIRT2 FALSE 172_SIRT2 Q8IXJ6 1.52 1.00 0.0815993 blue
MMP.1 FALSE 142_MMP-1 P03956 1.89 0.06 0.0407997 blue
MIP.1.alpha FALSE 166_MIP-1 alpha P10147 1.92 0.06 0.0203998 blue
VEGF.A FALSE 102_VEGF-A P15692 3.15 0.02 0.0000000 blue
CDCP1 FALSE 107_CDCP1 Q9H5V8 0.21 0.04 0.0000000 blue
CD244 FALSE 108_CD244 Q9BZW8 1.09 0.02 0.0000000 blue
OPG FALSE 110_OPG O00300 1.81 0.02 0.0000000 blue
LAP.TGF.beta.1 FALSE 111_LAP TGF-beta-1 P01137 0.86 0.02 0.0000000 blue
uPA FALSE 112_uPA P00749 1.75 0.02 0.0000000 blue
MCP.1 FALSE 115_MCP-1 P13500 1.90 0.02 0.0000000 blue
CXCL11 FALSE 117_CXCL11 O14625 1.40 0.02 0.0000000 blue
TRAIL FALSE 120_TRAIL P50591 1.22 0.02 0.0000000 blue
CXCL9 FALSE 122_CXCL9 Q07325 1.04 0.02 0.0000000 blue
CST5 FALSE 123_CST5 P28325 4.34 0.04 0.0000000 blue
OSM FALSE 126_OSM P13725 1.32 0.06 0.0000000 blue
CXCL1 FALSE 128_CXCL1 P09341 3.45 0.02 0.0000000 blue
CCL4 FALSE 130_CCL4 P13236 2.15 0.02 0.0000000 blue
CD6 FALSE 131_CD6 Q8WWJ7 1.78 0.04 0.0000000 blue
SCF FALSE 132_SCF P21583 2.30 0.02 0.0000000 blue
IL.18 FALSE 133_IL-18 Q14116 2.79 0.02 0.0000000 blue
TGF.alpha FALSE 135_TGF-alpha P01135 0.07 0.02 0.0000000 blue
MCP.4 FALSE 136_MCP-4 Q99616 0.56 0.04 0.0000000 blue
CCL11 FALSE 137_CCL11 P51671 2.09 0.02 0.0000000 blue
TNFSF14 FALSE 138_TNFSF14 O43557 1.81 0.02 0.0000000 blue
FGF.23 FALSE 139_FGF-23 Q9GZV9 0.66 0.04 0.0000000 blue
LIF.R FALSE 143_LIF-R P42702 2.08 0.04 0.0000000 blue
FGF.21 FALSE 144_FGF-21 Q9NSA1 1.14 0.02 0.0000000 blue
CCL19 FALSE 145_CCL19 Q99731 2.13 0.02 0.0000000 blue
IL.10RB FALSE 149_IL-10RB Q08334 1.72 0.02 0.0000000 blue
IL.18R1 FALSE 151_IL-18R1 Q13478 1.21 0.02 0.0000000 blue
PD.L1 FALSE 152_PD-L1 Q9NZQ7 2.14 0.04 0.0000000 blue
CXCL5 FALSE 154_CXCL5 P42830 1.92 0.02 0.0000000 blue
TRANCE FALSE 155_TRANCE O14788 1.51 0.04 0.0000000 blue
HGF FALSE 156_HGF P14210 1.77 0.02 0.0000000 blue
IL.12B FALSE 157_IL-12B P29460 0.85 0.02 0.0000000 blue
MMP.10 FALSE 161_MMP-10 P09238 1.56 0.02 0.0000000 blue
CCL23 FALSE 164_CCL23 P55773 2.10 0.02 0.0000000 blue
CD5 FALSE 165_CD5 P06127 1.42 0.02 0.0000000 blue
Flt3L FALSE 167_Flt3L P49771 2.12 0.02 0.0000000 blue
CXCL6 FALSE 168_CXCL6 P80162 1.80 0.02 0.0000000 blue
CXCL10 FALSE 169_CXCL10 P02778 2.03 0.02 0.0000000 blue
4E.BP1 FALSE 170_4E-BP1 Q13541 1.04 0.10 0.0000000 blue
DNER FALSE 174_DNER Q8NFT8  1.63 0.02 0.0000000 blue
CD40 FALSE 176_CD40 P25942 1.91 0.04 0.0000000 blue
FGF.19 FALSE 179_FGF-19 O95750 1.25 0.02 0.0000000 blue
MCP.2 FALSE 183_MCP-2 P80075 2.10 0.02 0.0000000 blue
CASP.8 FALSE 184_CASP-8 Q14790 1.38 0.04 0.0000000 blue
CCL25 FALSE 185_CCL25 O15444 1.32 0.02 0.0000000 blue
CX3CL1 FALSE 186_CX3CL1 P78423 1.76 0.02 0.0000000 blue
TNFRSF9 FALSE 187_TNFRSF9 Q07011 1.86 0.02 0.0000000 blue
TWEAK FALSE 189_TWEAK O43508 1.78 0.02 0.0000000 blue
CCL20 FALSE 190_CCL20 P78556 1.42 0.02 0.0000000 blue
STAMPB FALSE 192_STAMPB O95630 1.40 0.06 0.0000000 red
ADA FALSE 194_ADA P00813 1.80 0.04 0.0000000 blue
TNFB FALSE 195_TNFB P01374 1.10 0.04 0.0000000 blue
CSF.1 FALSE 196_CSF-1 P09603 1.02 0.02 0.0000000 blue

maEndtoEnd

Web: https://bioconductor.org/packages/release/workflows/html/maEndToEnd.html.

Examples can be found on PCA, heatmap, normalisation, linear models, and enrichment analysis from this Bioconductor package.