This function maps consequences (CSQ) of a given list of variants to those in an annotated set based on facilities such as variant effect predictor (VEP). In the case of protein-altering variants (PAVs), many consequences can be involved. The procedure also takes into account linkage disequilibrium (LD) in flanking regions.
Usage
csq(
query_loci,
annotated_loci,
pattern,
ldops = NULL,
flanking = 1e+06,
pop = "EUR",
verbose = TRUE
)
Arguments
- query_loci
A data.frame of loci whose consequences are to be obtained.
- annotated_loci
A data.frame of annotated loci.
- pattern
A character string or pattern to match the consequences against.
- ldops
Arguments for
ieugwasr::ld_matrix_local()
, typically a list withbfile
andplink
paths.- flanking
A numeric value specifying the flanking distance (default is 1e6).
- pop
A character string specifying the reference population for
ieugwasr::ld_matrix()
(default is "EUR").- verbose
A logical flag to show nonexistent variants (default is TRUE).
Examples
if (FALSE) { # \dontrun{
options(width=2000)
suppressMessages(require(dplyr))
suppressMessages(require(stringr))
# SCALLOP-INF list
METAL <- read.delim(file.path(find.package("pQTLtools"), "tests", "INF1.METAL")) %>%
dplyr::left_join(gap.datasets::inf1[c("prot", "gene")]) %>%
dplyr::mutate(prot = gene, chr = Chromosome, pos = Position)
# VEP output
vep <- "/rds/project/rds-zuZwCZMsS0w/Caprion_proteomics/analysis/bgen/vep"
pattern <- paste(protein_altering_variants, collapse = "|")
suppressMessages(require(GenomicRanges))
INF <- "/rds/project/rds-zuZwCZMsS0w/olink_proteomics/scallop/INF"
plink <- "/rds/user/jhz22/hpc-work/bin/plink"
# CSQ
b <- list()
for (i in unique(dplyr::pull(METAL, Chromosome))) {
m <- dplyr::filter(METAL, Chromosome %in% i) %>%
dplyr::select(chr, pos, MarkerName, prot) %>%
dplyr::mutate(rsid = gsub("chr", "", MarkerName)) %>%
dplyr::select(-MarkerName)
u <- read.delim(file.path(vep, paste0("chr", i, ".tab.gz"))) %>%
dplyr::select(Chrom, Pos, X.Uploaded_variation, Consequence) %>%
setNames(c("chr", "pos", "rsid", "csq"))
bfile <- file.path(INF, "INTERVAL", "per_chr", paste0("snpid", i))
b[[i]] <- csq(m, u, pattern, ldops = list(bfile = bfile, plink = plink))
}
r <- dplyr::bind_rows(b) %>%
dplyr::filter(r2 >= 0.8) %>%
dplyr::rename(gene = prot) %>%
dplyr::mutate(seqnames = as.integer(seqnames), pos = as.integer(pos)) %>%
dplyr::arrange(seqnames, pos) %>%
dplyr::select(-ref.seqnames, -ref.start, -ref.end, -seqnames, -pos, -start, -end)
w <- r %>%
group_by(gene,rsid) %>%
summarize(ref.rsid.all=paste(ref.rsid,collapse=";"),
ref.pos.all=paste(ref.pos,collapse=";"),
ref.csq.all=paste(ref.csq,collapse=";"),
r2.all=paste(r2,collapse=";"))
} # }