ExpressionSet/SummarizedExperiment usage • pQTLtools

es_pkgs <- c("Biobase", "arrayQualityMetrics", "dplyr", "knitr", "mclust", "pQTLtools", "rgl", "scatterplot3d")
se_pkgs <- c("BiocGenerics","GenomicRanges","IRanges","MsCoreUtils","S4Vectors","SummarizedExperiment")
pkgs <- c(es_pkgs,se_pkgs)
for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) {
    if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install"))
}
invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))

We start with Bioconductor/Biobase’s ExpressionSet example and finish with a real application.

dataDirectory <- system.file("extdata", package="Biobase")
exprsFile <- file.path(dataDirectory, "exprsData.txt")
exprs <- as.matrix(read.table(exprsFile, header=TRUE, sep="\t", row.names=1, as.is=TRUE))
pDataFile <- file.path(dataDirectory, "pData.txt")
pData <- read.table(pDataFile, row.names=1, header=TRUE, sep="\t")
all(rownames(pData)==colnames(exprs))
metadata <- data.frame(labelDescription=c("Patient gender",
                                          "Case/control status",
                                          "Tumor progress on XYZ scale"),
                       row.names=c("gender", "type", "score"))
phenoData <- Biobase::AnnotatedDataFrame(data=pData, varMetadata=metadata)
experimentData <- Biobase::MIAME(name="Pierre Fermat",
                                 lab="Francis Galton Lab",
                                 contact="pfermat@lab.not.exist",
                                 title="Smoking-Cancer Experiment",
                                 abstract="An example ExpressionSet",
                                 url="www.lab.not.exist",
                                 other=list(notes="Created from text files"))
exampleSet <- pQTLtools::make_ExpressionSet(exprs,phenoData,experimentData=experimentData,
                                            annotation="hgu95av2")
data(sample.ExpressionSet, package="Biobase")
identical(exampleSet,sample.ExpressionSet)

1 data.frame

The great benefit is to use the object directly as a data.frame.

lm.result <- Biobase::esApply(exampleSet,1,function(x) lm(score~gender+x))
beta.x <- unlist(lapply(lapply(lm.result,coef),"[",3))
beta.x[1]
#> AFFX-MurIL2_at.x 
#>    -0.0001907472
lm(score~gender+AFFX.MurIL2_at,data=exampleSet)
#> 
#> Call:
#> lm(formula = score ~ gender + AFFX.MurIL2_at, data = exampleSet)
#> 
#> Coefficients:
#>    (Intercept)      genderMale  AFFX.MurIL2_at  
#>      0.5531725       0.0098932      -0.0001907

2 Composite plots

We wish to examine the distribution of each feature via histogram, scatter and boxplot. One could resort to esApply() for its simplicity as before

invisible(Biobase::esApply(exampleSet[1:2],1,function(x)
                           {par(mfrow=c(3,1));boxplot(x);hist(x);plot(x)}
))

but it is nicer to add feature name in the title.

par(mfrow=c(1,3))
f <- Biobase::featureNames(exampleSet[1:2])
invisible(sapply(f,function(x) {
                     d <- t(Biobase::exprs(exampleSet[x]))
                     fn <- Biobase::featureNames(exampleSet[x])
                     hist(d,main="",xlab=fn); plot(d, ylab=fn); boxplot(d,ylab=fn)
                   }
          )
)

Figure 2.1: Histogram, scatter & boxplots

Figure 2.2: Histogram, scatter & boxplots

where the expression set is indexed using feature name(s).

3 Outlier detections

This illustrates one mechanism,

list_outliers <- function(es, method="upperquartile")
                 arrayQualityMetrics::outliers(exprs(es),method=method)
for (method in c("KS","sum","upperquartile"))
{
  ZWK_outliers <- list_outliers(protein_ZWK,method=method)
  print(ZWK_outliers@statistic[ZWK_outliers@which])
}

4 Clustering

We employ model-based clustering absed on principal compoents to see potential groupings in the data,

  pca <- prcomp(na.omit(t(Biobase::exprs(exampleSet))), rank=10, scale=TRUE)
  pc1pc2pc3 <- with(pca,x)[,1:3]
  mc <- mclust::Mclust(pc1pc2pc3,G=3)
  with(mc, {
      cols <- c("blue","red", "purple")
      s3d <- scatterplot3d::scatterplot3d(with(pca,x[,c(2,1,3)]),
                                          color=cols[classification],
                                          pch=16,
                                          type="h",
                                          main="Plot of the PC1, PC2 and PC3")
      s3d.coords <- s3d$xyz.convert(with(pca,x[,c(2,1,3)]))
      text(s3d.coords$x, 
           s3d.coords$y,   
           cex = 1.2,
           col = cols[classification],
           labels = row.names(pc1pc2pc3),
           pos = 4)
      legend("right", legend=levels(as.factor(classification)), col=cols[classification], pch=16)
      rgl::open3d(width = 500, height = 500)
      rgl::plot3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],size=5)
      rgl::text3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],texts=row.names(pc1pc2pc3))
      htmlwidgets::saveWidget(rgl::rglwidget(), file = "mcpca3d.html")
  })

Figure 4.1: Three-group Clustering

#> Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
#> parameter "width" cannot be set
#> Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
#> parameter "height" cannot be set

An interactive version is also available,

5 Data transformation

Suppose we wish use log2 for those greater than zero but set those negative values to be missing.

log2.na <- function(x) log2(ifelse(x>0, x, NA))
Biobase::exprs(exampleSet) <- log2.na(Biobase::exprs(exampleSet))

6 Limit of detection (LOD)

We generate a lod.max ~ U[0,1] variable to experiment

Biobase::fData(exampleSet)
#> data frame with 0 columns and 500 rows
Biobase::fData(exampleSet)$lod.max <- apply(Biobase::exprs(exampleSet),1,quantile,runif(nrow(exampleSet)))
lod <- pQTLtools::get.prop.below.LLOD(exampleSet)
x <- dplyr::arrange(Biobase::fData(lod),desc(pc.belowLOD.new))
knitr::kable(head(lod))

	AFFX.MurIL2_at	AFFX.MurIL10_at	AFFX.MurIL4_at	AFFX.MurFAS_at	AFFX.BioB.5_at	AFFX.BioB.M_at	gender	type	score
A	192.7420	97.13700	45.81920	22.54450	96.7875	89.0730	Female	Control	0.75
B	85.7533	126.19600	8.83135	3.60093	30.4380	25.8461	Male	Case	0.40
C	176.7570	77.92160	33.06320	14.68830	46.1271	57.2033	Male	Control	0.73
D	135.5750	93.37130	28.70720	12.33970	70.9319	69.9766	Male	Case	0.42
E	64.4939	24.39860	5.94492	36.86630	56.1744	49.5822	Female	Case	0.93
F	76.3569	85.50880	28.29250	11.25680	42.6756	26.1262	Male	Control	0.22
G	160.5050	98.90860	30.96940	23.00340	86.5156	75.0083	Male	Case	0.96
H	65.9631	81.69320	14.79230	16.21340	30.7927	42.3352	Male	Case	0.79
I	56.9039	97.80150	14.23990	12.03750	19.7183	41.1207	Female	Case	0.37
J	135.6080	90.48380	34.48740	4.54978	46.3520	91.5307	Male	Control	0.63
K	63.4432	70.57330	20.35210	8.51782	39.1326	39.9136	Male	Case	0.26
L	78.2126	94.54180	14.15540	27.28520	41.7698	49.8397	Female	Control	0.36
M	83.0943	75.34550	20.62510	10.16160	80.2197	63.4794	Male	Case	0.41
N	89.3372	68.58270	15.92310	20.24880	36.4903	24.7007	Male	Case	0.80
O	91.0615	87.40500	20.15790	15.78490	36.4021	47.4641	Female	Case	0.10
P	95.9377	84.45810	27.81390	14.32760	35.3054	47.3578	Female	Control	0.41
Q	179.8450	87.68060	32.79110	15.94880	58.6239	58.1331	Female	Case	0.16
R	152.4670	108.03200	33.52920	14.67530	114.0620	104.1220	Male	Control	0.72
S	180.8340	134.26300	19.81720	-7.91911	93.4402	115.8310	Male	Case	0.17
T	85.4146	91.40310	20.41900	12.88750	22.5168	58.1224	Female	Case	0.74
U	157.9890	-8.68811	26.87200	11.91860	48.6462	73.4221	Male	Control	0.35
V	146.8000	85.02120	31.14880	12.83240	90.2215	64.6066	Female	Control	0.77
W	93.8829	79.29980	22.34200	11.13900	42.0053	40.3068	Male	Control	0.27
X	103.8550	71.65520	19.01350	7.55564	57.5738	41.8209	Male	Control	0.98
Y	64.4340	64.23690	12.16860	19.98490	44.8216	46.1087	Female	Case	0.94
Z	175.6150	78.70680	17.37800	8.96849	61.7044	49.4122	Female	Case	0.32

plot(x[,2], main="Random quantile cutoff", ylab="<lod%")

Figure 6.1: LOD based on a random cutoff

The quantity has been shown to have a big impact on protein abundance and therefore pQTL detection as is shown with a real example.

rm(list=ls())
dir <- "~/rds/post_qc_data/interval/phenotype/olink_proteomics/post-qc/"
eset <- readRDS(paste0(dir,"eset.inf1.flag.out.outlier.in.rds"))
x <- pQTLtools::get.prop.below.LLOD(eset)
annot <- Biobase::fData(x)
annot$MissDataProp <- as.numeric(gsub("\\%$", "", annot$MissDataProp))
plot(annot$MissDataProp, annot$pc.belowLOD.new, xlab="% <LLOD in Original",
     ylab="% <LLOD in post QC dataset", pch=19)
INF <- Sys.getenv("INF")
np <- read.table(paste(INF, "work", "INF1.merge.nosig", sep="/"), header=FALSE,
                 col.names = c("prot", "uniprot"))
kable(np, caption="Proteins with no pQTL")
annot$pQTL <- rep(NA, nrow(annot))
no.pQTL.ind <- which(annot$uniprot.id %in% np$uniprot)
annot$pQTL[no.pQTL.ind] <- "red"
annot$pQTL[-no.pQTL.ind] <- "blue"
annot <- annot[order(annot$pc.belowLOD.new, decreasing = TRUE),]
annot <- annot[-grep("^BDNF$", annot$ID),]
saveRDS(annot,file=file.path("~","pQTLtools","tests","annot.RDS"))

annot <- readRDS(file.path(find.package("pQTLtools"),"tests","annot.RDS")) %>%
         dplyr::left_join(pQTLdata::inf1[c("prot","target.short","alt_name")],by=c("ID"="prot")) %>%
         dplyr::mutate(prot=if_else(is.na(alt_name),target.short,alt_name),order=1:n()) %>%
         dplyr::arrange(desc(order))
xtick <- seq(1, nrow(annot))
attach(annot)
par(mar=c(10,5,1,1))
plot(100-pc.belowLOD.new,cex=2,pch=19,col=pQTL,xaxt="n",xlab="",ylab="",cex.axis=0.8)
text(66,16,"IL-17C",offset=0,pos=2,cex=1.5,font=2,srt=0)
arrows(67,16,71,16,lwd=2)
axis(1, at=xtick, labels=prot, lwd.tick=0.5, lwd=0, las=2, hadj=1, cex.axis=0.8)
mtext("% samples above LLOD",side=2,line=2.5,cex=1.2)
mtext("Ordered protein",side=1,line=6.5,cex=1.2,font=1)
legend(x=1,y=25,c("without pQTL","with pQTL"),box.lwd=0,cex=2,col=c("red","blue"),pch=19)

Figure 6.2: LOD in SCALLOP-INF/INTERVAL

detach(annot)
options(width=120)
knitr::kable(annot,caption="Summary statistics",row.names=FALSE)

Table 6.1: Summary statistics
ID	dichot	olink.id	uniprot.id	lod.max	MissDataProp	pc.belowLOD.new	pQTL	target.short	alt_name	prot	order
CSF.1	FALSE	196_CSF-1	P09603	1.02	0.02	0.0000000	blue	CSF-1	NA	CSF-1	91
TNFB	FALSE	195_TNFB	P01374	1.10	0.04	0.0000000	blue	TNFB	NA	TNFB	90
ADA	FALSE	194_ADA	P00813	1.80	0.04	0.0000000	blue	ADA	NA	ADA	89
STAMPB	FALSE	192_STAMPB	O95630	1.40	0.06	0.0000000	red	STAMPB	NA	STAMPB	88
CCL20	FALSE	190_CCL20	P78556	1.42	0.02	0.0000000	blue	CCL20	NA	CCL20	87
TWEAK	FALSE	189_TWEAK	O43508	1.78	0.02	0.0000000	blue	TWEAK	NA	TWEAK	86
TNFRSF9	FALSE	187_TNFRSF9	Q07011	1.86	0.02	0.0000000	blue	TNFRSF9	NA	TNFRSF9	85
CX3CL1	FALSE	186_CX3CL1	P78423	1.76	0.02	0.0000000	blue	CX3CL1	NA	CX3CL1	84
CCL25	FALSE	185_CCL25	O15444	1.32	0.02	0.0000000	blue	CCL25	NA	CCL25	83
CASP.8	FALSE	184_CASP-8	Q14790	1.38	0.04	0.0000000	blue	CASP-8	NA	CASP-8	82
MCP.2	FALSE	183_MCP-2	P80075	2.10	0.02	0.0000000	blue	MCP-2	CCL8	CCL8	81
FGF.19	FALSE	179_FGF-19	O95750	1.25	0.02	0.0000000	blue	FGF-19	NA	FGF-19	80
CD40	FALSE	176_CD40	P25942	1.91	0.04	0.0000000	blue	CD40	NA	CD40	79
DNER	FALSE	174_DNER	Q8NFT8	1.63	0.02	0.0000000	blue	DNER	NA	DNER	78
4E.BP1	FALSE	170_4E-BP1	Q13541	1.04	0.10	0.0000000	blue	4EBP1	NA	4EBP1	77
CXCL10	FALSE	169_CXCL10	P02778	2.03	0.02	0.0000000	blue	CXCL10	NA	CXCL10	76
CXCL6	FALSE	168_CXCL6	P80162	1.80	0.02	0.0000000	blue	CXCL6	CXCL6	CXCL6	75
Flt3L	FALSE	167_Flt3L	P49771	2.12	0.02	0.0000000	blue	FIt3L	NA	FIt3L	74
CD5	FALSE	165_CD5	P06127	1.42	0.02	0.0000000	blue	CD5	NA	CD5	73
CCL23	FALSE	164_CCL23	P55773	2.10	0.02	0.0000000	blue	CCL23	NA	CCL23	72
MMP.10	FALSE	161_MMP-10	P09238	1.56	0.02	0.0000000	blue	MMP-10	NA	MMP-10	71
IL.12B	FALSE	157_IL-12B	P29460	0.85	0.02	0.0000000	blue	IL-12B	NA	IL-12B	70
HGF	FALSE	156_HGF	P14210	1.77	0.02	0.0000000	blue	HGF	NA	HGF	69
TRANCE	FALSE	155_TRANCE	O14788	1.51	0.04	0.0000000	blue	TRANCE	NA	TRANCE	68
CXCL5	FALSE	154_CXCL5	P42830	1.92	0.02	0.0000000	blue	CXCL5	NA	CXCL5	67
PD.L1	FALSE	152_PD-L1	Q9NZQ7	2.14	0.04	0.0000000	blue	PD-L1	NA	PD-L1	66
IL.18R1	FALSE	151_IL-18R1	Q13478	1.21	0.02	0.0000000	blue	IL-18R1	NA	IL-18R1	65
IL.10RB	FALSE	149_IL-10RB	Q08334	1.72	0.02	0.0000000	blue	IL10RB	NA	IL10RB	64
CCL19	FALSE	145_CCL19	Q99731	2.13	0.02	0.0000000	blue	CCL19	NA	CCL19	63
FGF.21	FALSE	144_FGF-21	Q9NSA1	1.14	0.02	0.0000000	blue	FGF-21	NA	FGF-21	62
LIF.R	FALSE	143_LIF-R	P42702	2.08	0.04	0.0000000	blue	LIF-R	NA	LIF-R	61
FGF.23	FALSE	139_FGF-23	Q9GZV9	0.66	0.04	0.0000000	blue	FGF-23	NA	FGF-23	60
TNFSF14	FALSE	138_TNFSF14	O43557	1.81	0.02	0.0000000	blue	TNFSF14	NA	TNFSF14	59
CCL11	FALSE	137_CCL11	P51671	2.09	0.02	0.0000000	blue	CCL11	CCL11	CCL11	58
MCP.4	FALSE	136_MCP-4	Q99616	0.56	0.04	0.0000000	blue	MCP-4	CCL13	CCL13	57
TGF.alpha	FALSE	135_TGF-alpha	P01135	0.07	0.02	0.0000000	blue	TGF-alpha	NA	TGF-alpha	56
IL.18	FALSE	133_IL-18	Q14116	2.79	0.02	0.0000000	blue	IL-18	NA	IL-18	55
SCF	FALSE	132_SCF	P21583	2.30	0.02	0.0000000	blue	SCF	NA	SCF	54
CD6	FALSE	131_CD6	Q8WWJ7	1.78	0.04	0.0000000	blue	CD6	NA	CD6	53
CCL4	FALSE	130_CCL4	P13236	2.15	0.02	0.0000000	blue	CCL4	NA	CCL4	52
CXCL1	FALSE	128_CXCL1	P09341	3.45	0.02	0.0000000	blue	CXCL1	NA	CXCL1	51
OSM	FALSE	126_OSM	P13725	1.32	0.06	0.0000000	blue	OSM	NA	OSM	50
CST5	FALSE	123_CST5	P28325	4.34	0.04	0.0000000	blue	CST5	NA	CST5	49
CXCL9	FALSE	122_CXCL9	Q07325	1.04	0.02	0.0000000	blue	CXCL9	NA	CXCL9	48
TRAIL	FALSE	120_TRAIL	P50591	1.22	0.02	0.0000000	blue	TRAIL	NA	TRAIL	47
CXCL11	FALSE	117_CXCL11	O14625	1.40	0.02	0.0000000	blue	CXCL11	NA	CXCL11	46
MCP.1	FALSE	115_MCP-1	P13500	1.90	0.02	0.0000000	blue	MCP-1	CCL2	CCL2	45
uPA	FALSE	112_uPA	P00749	1.75	0.02	0.0000000	blue	uPA	NA	uPA	44
LAP.TGF.beta.1	FALSE	111_LAP TGF-beta-1	P01137	0.86	0.02	0.0000000	blue	LAP TGF-beta-1	NA	LAP TGF-beta-1	43
OPG	FALSE	110_OPG	O00300	1.81	0.02	0.0000000	blue	OPG	NA	OPG	42
CD244	FALSE	108_CD244	Q9BZW8	1.09	0.02	0.0000000	blue	CD244	NA	CD244	41
CDCP1	FALSE	107_CDCP1	Q9H5V8	0.21	0.04	0.0000000	blue	CDCP1	NA	CDCP1	40
VEGF.A	FALSE	102_VEGF-A	P15692	3.15	0.02	0.0000000	blue	VEGF_A	NA	VEGF_A	39
MIP.1.alpha	FALSE	166_MIP-1 alpha	P10147	1.92	0.06	0.0203998	blue	MIP-1 alpha	NA	MIP-1 alpha	38
MMP.1	FALSE	142_MMP-1	P03956	1.89	0.06	0.0407997	blue	MMP-1	NA	MMP-1	37
SIRT2	FALSE	172_SIRT2	Q8IXJ6	1.52	1.00	0.0815993	blue	SIRT2	NA	SIRT2	36
IL.8	FALSE	101_IL-8	P10145	3.17	0.25	0.2447980	blue	IL-8	NA	IL-8	35
IL.7	FALSE	109_IL-7	P13232	1.39	1.43	0.7547940	blue	IL-7	NA	IL-7	34
SLAMF1	FALSE	134_SLAMF1	Q13291	1.55	2.13	1.5299878	blue	SLAMF1	NA	SLAMF1	33
EN.RAGE	FALSE	175_EN-RAGE	P80511	1.57	3.28	3.5291718	blue	EN-RAGE	NA	EN-RAGE	32
Beta.NGF	FALSE	153_Beta-NGF	P01138	1.23	3.96	4.2227662	blue	Beta-NGF	NA	Beta-NGF	31
CCL28	FALSE	173_CCL28	Q9NRJ3	0.92	5.49	5.2835577	red	CCL28	NA	CCL28	30
IL.10	FALSE	162_IL-10	P22301	1.50	7.71	7.0583435	blue	IL-10	NA	IL-10	29
AXIN1	FALSE	118_AXIN1	O15169	1.30	13.04	10.8731130	red	AXIN1	NA	AXIN1	28
NT.3	FALSE	188_NT-3	P20783	0.85	12.77	13.2394941	blue	NT-3	NA	NT-3	27
FGF.5	FALSE	141_FGF-5	Q8NF90	1.00	31.71	30.3957568	blue	FGF-5	NA	FGF-5	26
ST1A1	FALSE	191_ST1A1	P50225	1.34	32.68	31.2525500	blue	ST1A1	NA	ST1A1	25
GDNF	FALSE	106_GDNF	P39905	1.13	43.26	42.4520604	blue	hGDNF	NA	hGDNF	24
IL.10RA	FALSE	140_IL-10RA	Q13651	1.14	53.30	53.9779682	red	IL-10RA	NA	IL-10RA	23
IL.6	FALSE	113_IL-6	P05231	1.97	69.06	68.6250510	blue	IL-6	NA	IL-6	22
IL.5	FALSE	193_IL-5	P05113	1.55	78.21	77.8457772	red	IL-5	NA	IL-5	21
MCP.3	FALSE	105_MCP-3	P80098	1.31	79.30	79.0697674	blue	MCP-3	CCL7	CCL7	20
IL.17C	FALSE	114_IL-17C	Q9P0M4	1.28	83.07	83.3741330	blue	IL-17C	NA	IL-17C	19
IL.17A	FALSE	116_IL-17A	Q16552	0.97	84.73	84.7001224	red	IL-17A	NA	IL-17A	18
IL.4	FALSE	180_IL-4	P05112	1.11	93.91	94.4104447	red	IL-4	NA	IL-4	17
LIF	FALSE	181_LIF	P15018	1.45	94.90	94.8184415	red	LIF	NA	LIF	16
TNF	FALSE	163_TNF	P01375	1.19	95.22	95.1244390	red	TNF	NA	TNF	15
IL.20RA	FALSE	121_IL-20RA	Q9UHF4	0.79	95.16	95.7772338	red	IL-20RA	NA	IL-20RA	14
ARTN	FALSE	160_ARTN	Q5T4W7	0.73	95.84	95.9404325	red	ARTN	NA	ARTN	13
IL.15RA	FALSE	148_IL-15RA	Q13261	1.18	96.10	96.1648307	blue	IL-15RA	NA	IL-15RA	12
IL.13	FALSE	159_IL-13	P35225	1.71	96.51	96.5524276	red	IL-13	NA	IL-13	11
IL.20	FALSE	171_IL-20	Q9NYY1	1.30	96.72	96.7156263	red	IL-20	NA	IL-20	10
IL.24	FALSE	158_IL-24	Q13007	1.95	97.13	97.0828233	red	IL-24	NA	IL-24	9
IL.2RB	FALSE	124_IL-2RB	P14784	1.47	97.11	97.1032232	red	IL-2RB	NA	IL-2RB	8
NRTN	FALSE	182_NRTN	Q99748	1.39	97.11	97.1236230	red	NRTN	NA	NRTN	7
IL.33	FALSE	177_IL-33	O95760	1.17	98.54	98.5516116	red	IL-33	NA	IL-33	6
TSLP	FALSE	129_TSLP	Q969D9	1.80	99.49	99.4900041	red	TSLP	NA	TSLP	5
IFN.gamma	FALSE	178_IFN-gamma	P01579	1.40	99.53	99.5308038	red	IFN-gamma	NA	IFN-gamma	4
IL.22.RA1	FALSE	150_IL-22 RA1	Q8N6P7	1.79	99.61	99.6124031	red	IL-22RA1	NA	IL-22RA1	3
IL.2	FALSE	127_IL-2	P60568	0.93	99.86	99.8776010	red	IL-2	NA	IL-2	2
IL.1.alpha	FALSE	125_IL-1 alpha	P01583	4.76	99.88	99.8776010	blue	IL-1 alpha	NA	IL-1 alpha	1

7 Combination of proteins and other variables

This is available by design.

8 maEndtoEnd

Web: https://bioconductor.org/packages/release/workflows/html/maEndToEnd.html.

Examples can be found on PCA, heatmap, normalisation, linear models, and enrichment analysis from this Bioconductor package.

9 SummarizedExperiment

This is more modern construct. The following script is adapted from the documentation example, illustrating range summarized experiment (rse) adding missing values.

set.seed(123)
nrows <- 20
ncols <- 4
counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
missing_indices <- sample(length(counts), size = 5, replace = FALSE)
counts[missing_indices] <- NA
rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(5, 15)),
                            IRanges::IRanges(floor(runif(nrows, 1e5, 1e6)), width=ncols),
                            strand=sample(c("+", "-"), nrows, TRUE),
                            feature_id=sprintf("ID%03d", 1:nrows))
colData <- S4Vectors::DataFrame(Treatment=rep(c("ChIP", "Input"), ncols/2),
                                row.names=LETTERS[1:ncols])
rse <- SummarizedExperiment(assays=S4Vectors::SimpleList(counts=counts),
                            rowRanges=rowRanges, colData=colData)
assay(rse)
#>               A         B         C           D
#>  [1,] 2876.4876 8895.5036 1428.8574 6651.486831
#>  [2,] 7883.2630 6928.3413 4146.0488  949.311769
#>  [3,] 4090.3602 6405.4276 4137.8295 3840.312408
#>  [4,] 8830.2910 9942.7035 3689.0857 2744.562062
#>  [5,] 9404.7324 6557.4023 1525.2950 8146.585749
#>  [6,]        NA 7085.5962 1388.9218 4485.714898
#>  [7,] 5281.5268 5441.1162 2331.1080 8100.833466
#>  [8,] 8924.2980 5941.8261 4660.1585 8124.082706
#>  [9,] 5514.7987 2892.3082 2660.4604 7943.628869
#> [10,] 4566.6907 1471.9894        NA 4398.877044
#> [11,] 9568.3766        NA  459.2658 7544.997111
#> [12,] 4533.8882 9023.0882 4422.5585          NA
#> [13,] 6776.0288 6907.3621 7989.4495 7102.113831
#> [14,] 5726.7614 7954.8787 1219.8707    7.247108
#> [15,] 1030.1439  247.1122 5609.9189 4753.690424
#> [16,] 8998.3499 4778.4819 2066.1074 2201.968733
#> [17,] 2461.6313 7584.8369 1276.1890 3798.785561
#> [18,]  421.5533 2164.8630 7533.3253 6128.097262
#> [19,] 3279.8793        NA 8950.5585 3518.627295
#> [20,] 9545.0820 2317.0262 3745.2533 1112.243108
imputed <- MsCoreUtils::impute_knn(as.matrix(SummarizedExperiment::assay(rse)),2)
imputed_counts <- MsCoreUtils::impute_RF(as.matrix(assay(rse)),2)
imputed-imputed_counts
#>              A         B        C        D
#>  [1,]    0.000    0.0000    0.000    0.000
#>  [2,]    0.000    0.0000    0.000    0.000
#>  [3,]    0.000    0.0000    0.000    0.000
#>  [4,]    0.000    0.0000    0.000    0.000
#>  [5,]    0.000    0.0000    0.000    0.000
#>  [6,] 2559.339    0.0000    0.000    0.000
#>  [7,]    0.000    0.0000    0.000    0.000
#>  [8,]    0.000    0.0000    0.000    0.000
#>  [9,]    0.000    0.0000    0.000    0.000
#> [10,]    0.000    0.0000 1791.909    0.000
#> [11,]    0.000 -709.2786    0.000    0.000
#> [12,]    0.000    0.0000    0.000 2467.614
#> [13,]    0.000    0.0000    0.000    0.000
#> [14,]    0.000    0.0000    0.000    0.000
#> [15,]    0.000    0.0000    0.000    0.000
#> [16,]    0.000    0.0000    0.000    0.000
#> [17,]    0.000    0.0000    0.000    0.000
#> [18,]    0.000    0.0000    0.000    0.000
#> [19,]    0.000 1298.3456    0.000    0.000
#> [20,]    0.000    0.0000    0.000    0.000
SummarizedExperiment::assays(rse) <- S4Vectors::SimpleList(counts=imputed_counts)
SummarizedExperiment::assay(rse)
#>               A         B         C           D
#>  [1,] 2876.4876 8895.5036 1428.8574 6651.486831
#>  [2,] 7883.2630 6928.3413 4146.0488  949.311769
#>  [3,] 4090.3602 6405.4276 4137.8295 3840.312408
#>  [4,] 8830.2910 9942.7035 3689.0857 2744.562062
#>  [5,] 9404.7324 6557.4023 1525.2950 8146.585749
#>  [6,] 4439.2947 7085.5962 1388.9218 4485.714898
#>  [7,] 5281.5268 5441.1162 2331.1080 8100.833466
#>  [8,] 8924.2980 5941.8261 4660.1585 8124.082706
#>  [9,] 5514.7987 2892.3082 2660.4604 7943.628869
#> [10,] 4566.6907 1471.9894 4812.3872 4398.877044
#> [11,] 9568.3766 5545.7565  459.2658 7544.997111
#> [12,] 4533.8882 9023.0882 4422.5585 4611.837102
#> [13,] 6776.0288 6907.3621 7989.4495 7102.113831
#> [14,] 5726.7614 7954.8787 1219.8707    7.247108
#> [15,] 1030.1439  247.1122 5609.9189 4753.690424
#> [16,] 8998.3499 4778.4819 2066.1074 2201.968733
#> [17,] 2461.6313 7584.8369 1276.1890 3798.785561
#> [18,]  421.5533 2164.8630 7533.3253 6128.097262
#> [19,] 3279.8793 4566.2289 8950.5585 3518.627295
#> [20,] 9545.0820 2317.0262 3745.2533 1112.243108
SummarizedExperiment::assays(rse) <- S4Vectors::endoapply(SummarizedExperiment::assays(rse), asinh)
SummarizedExperiment::assay(rse)
#>              A        B        C        D
#>  [1,] 8.657472 9.786448 7.957778 9.495743
#>  [2,] 9.665644 9.536523 9.023058 7.548885
#>  [3,] 9.009536 9.458048 9.021074 8.946456
#>  [4,] 9.779090 9.897741 8.906281 8.610524
#>  [5,] 9.842115 9.481497 8.023090 9.698501
#>  [6,] 9.091398 9.558966 7.929430 9.101800
#>  [7,] 9.265118 9.294887 8.447246 9.692869
#>  [8,] 9.789680 9.382919 9.139952 9.695735
#>  [9,] 9.308338 8.662957 8.579402 9.673273
#> [10,] 9.119691 7.987517 9.172096 9.082252
#> [11,] 9.859366 9.313936 6.822778 9.621787
#> [12,] 9.112482 9.800689 9.087621 9.129529
#> [13,] 9.514294 9.533490 9.679024 9.561295
#> [14,] 9.346053 9.674688 7.799647 2.678476
#> [15,] 7.630601 6.202994 9.325439 9.159824
#> [16,] 9.797944 9.165025 8.326569 8.390254
#> [17,] 8.501727 9.627054 7.844781 8.935584
#> [18,] 6.737095 8.373260 9.620239 9.413787
#> [19,] 8.788709 9.119590 9.792618 8.858973
#> [20,] 9.856929 8.441187 8.921392 7.707281

SummarizedExperiment::rowRanges(rse)
#> GRanges object with 20 ranges and 1 metadata column:
#>        seqnames        ranges strand |  feature_id
#>           <Rle>     <IRanges>  <Rle> | <character>
#>    [1]     chr1 409164-409167      - |       ID001
#>    [2]     chr1 691082-691085      + |       ID002
#>    [3]     chr1 388335-388338      + |       ID003
#>    [4]     chr1 268922-268925      - |       ID004
#>    [5]     chr1 804064-804067      - |       ID005
#>    ...      ...           ...    ... .         ...
#>   [16]     chr2 647861-647864      + |       ID016
#>   [17]     chr2 469620-469623      + |       ID017
#>   [18]     chr2 232385-232388      + |       ID018
#>   [19]     chr2 941769-941772      - |       ID019
#>   [20]     chr2 371106-371109      + |       ID020
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
SummarizedExperiment::rowData(rse)
#> DataFrame with 20 rows and 1 column
#>      feature_id
#>     <character>
#> 1         ID001
#> 2         ID002
#> 3         ID003
#> 4         ID004
#> 5         ID005
#> ...         ...
#> 16        ID016
#> 17        ID017
#> 18        ID018
#> 19        ID019
#> 20        ID020
SummarizedExperiment::colData(rse)
#> DataFrame with 4 rows and 1 column
#>     Treatment
#>   <character>
#> A        ChIP
#> B       Input
#> C        ChIP
#> D       Input

rse[, rse$Treatment == "ChIP"]
#> class: RangedSummarizedExperiment 
#> dim: 20 2 
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(2): A C
#> colData names(1): Treatment

# the same ranges but different samples:
rse1 <- rse
rse2 <- rse1[, 1:3]
colnames(rse2) <- letters[1:ncol(rse2)]
cmb1 <- cbind(rse1, rse2)
dim(cmb1)
#> [1] 20  7
dimnames(cmb1)
#> [[1]]
#> NULL
#> 
#> [[2]]
#> [1] "A" "B" "C" "D" "a" "b" "c"

# the same samples but different ranges:
rse1 <- rse
rse2 <- rse1[1:20, ]
rownames(rse2) <- letters[1:nrow(rse2)]
cmb2 <- rbind(rse1, rse2)
dim(cmb2)
#> [1] 40  4
dimnames(cmb2)
#> [[1]]
#>  [1] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "a" "b" "c" "d" "e" "f" "g" "h" "i"
#> [30] "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t"
#> 
#> [[2]]
#> [1] "A" "B" "C" "D"

se0 <- as(rse, "SummarizedExperiment")
se0
#> class: SummarizedExperiment 
#> dim: 20 4 
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(4): A B C D
#> colData names(1): Treatment
as(se0, "RangedSummarizedExperiment")
#> class: RangedSummarizedExperiment 
#> dim: 20 4 
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(4): A B C D
#> colData names(1): Treatment

# rowRanges(a SummarizedExperiment object) --> RangedSummarizedExperiment object:
se <- se0
rowRanges(se) <- rowRanges
se
#> class: RangedSummarizedExperiment 
#> dim: 20 4 
#> metadata(0):
#> assays(1): counts
#> rownames: NULL
#> rowData names(1): feature_id
#> colnames(4): A B C D
#> colData names(1): Treatment

stopifnot(identical(SummarizedExperiment::assays(se0), SummarizedExperiment::assays(rse)))
stopifnot(identical(dim(se0), dim(rse)))
stopifnot(identical(dimnames(se0), dimnames(rse)))
stopifnot(identical(SummarizedExperiment::rowData(se0), SummarizedExperiment::rowData(rse)))
stopifnot(identical(SummarizedExperiment::colData(se0), SummarizedExperiment::colData(rse)))