This repository was archived by the owner on Aug 1, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
SQL Functions
Robert A. Petit III edited this page Mar 12, 2018
·
14 revisions
Compare SNP counts of all samples against a subset of samples
CREATE OR REPLACE FUNCTION snp_counts(_sample_array int[])
RETURNS TABLE (
snp_id int,
all_count int,
subset_count bigint
) AS $func$
BEGIN
RETURN QUERY
SELECT c.snp_id, c.count, t.count_snp
FROM variant_snpcounts c
RIGHT JOIN (SELECT v.snp_id, count(v.snp_id) AS count_snp
FROM variant_tosnp v
WHERE sample_id = ANY(_sample_array)
GROUP BY v.snp_id) t
ON t.snp_id=c.snp_id;
END
$func$ LANGUAGE plpgsql;
select * from snp_counts(ARRAY(x,y,z,...));
Compare SNP counts of ST5 samples
SELECT * from snp_counts(
(SELECT array(SELECT ("sample_id")
FROM mlst_srst2
WHERE st_stripped=5 and is_exact=True)
)
);
Returns the sequencing center and count of samples.
CREATE OR REPLACE FUNCTION top_sequencing_centers(_limit int default 10, _published boolean default FALSE)
RETURNS TABLE (
sequencing_center text,
count bigint
) AS $func$
BEGIN
IF _published = TRUE THEN
RETURN QUERY
SELECT cn.name, count(cn.name) as center_count
FROM ena_centernames cn
RIGHT JOIN (
SELECT s.center_name
FROM ena_experiment s
WHERE s.experiment_accession IN (
SELECT DISTINCT p.experiment_accession FROM ena_topublication p
)
) sm
ON sm.center_name=cn.ena_name
GROUP BY cn.name
ORDER BY center_count DESC
LIMIT _limit;
ELSE
RETURN QUERY
SELECT cn.name, count(cn.name) AS center_count
FROM (
SELECT metadata->>'center_name' AS name
FROM sample_metadata
) cn
GROUP BY cn.name
ORDER BY center_count DESC
LIMIT _limit;
END IF;
END
$func$ LANGUAGE plpgsql;
select * from top_sequencing_centers();
Return top 10 sequencing centers
staphopia=> select * from top_sequencing_centers(10);
sequencing_center | count
--------------------------+-------
SC | 17237
BI | 5106
UNIVERISTY OF OXFORD | 1127
Emory University | 216
TGEN-NORTH | 115
UNIMELB | 83
TGEN | 79
University of Melbourne | 59
UBC | 49
University of Washington | 46
(10 rows)
Return the top sequencing center
staphopia=> select * from top_sequencing_centers(1);
sequencing_center | count
-------------------+-------
SC | 17237
CREATE OR REPLACE FUNCTION top_sequence_types(_limit int default 10, _published boolean default FALSE)
RETURNS TABLE (
sequence_type int,
count bigint
) AS $func$
BEGIN
IF _published = TRUE THEN
RETURN QUERY
SELECT st, count(st)
FROM mlst_mlst m
LEFT JOIN sample_sample s
ON s.id=m.sample_id
WHERE version_id=1 AND s.is_public=TRUE AND s.is_published=TRUE AND m.st > 0
GROUP BY st
ORDER BY count DESC
LIMIT _limit;
ELSE
RETURN QUERY
SELECT st, count(st)
FROM mlst_mlst m
LEFT JOIN sample_sample s
ON s.id=m.sample_id
WHERE version_id=1 AND s.is_public=TRUE AND m.st > 0
GROUP BY st
ORDER BY count DESC
LIMIT _limit;
END IF;
END
$func$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION sccmec_status_by_st(_sequence_type int)
RETURNS TABLE (
sample_tag text,
st int,
meca_coverage decimal,
is_exact boolean
) AS $func$
BEGIN
RETURN QUERY
SELECT DISTINCT ON (sa.id)
sa.sample_tag, m.st_stripped as st, s.meca_total, m.is_exact
FROM mlst_srst2 m
LEFT JOIN sccmec_coverage s
ON m.sample_id=s.sample_id
LEFT JOIN sample_metadata sa
ON m.sample_id=sa.id
WHERE m.st_stripped=_sequence_type;
END
$func$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION illumina_by_year()
RETURNS TABLE (
year int,
total bigint
) AS $func$
BEGIN
RETURN QUERY
select (first_year :: int), count(first_year)
from (
select extract(year from date(first_public)) AS first_year
from ena_run
where experiment_accession IN (
SELECT experiment_accession
from ena_experiment
where instrument_platform='ILLUMINA'
)
order by first_year
) AS first_year
group by first_year;
END
$func$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION published_illumina_by_year()
RETURNS TABLE (
year int,
total bigint
) AS $func$
BEGIN
RETURN QUERY
select (first_year :: int), count(first_year)
from (
select extract(year from date(first_public)) as first_year
from ena_run
where experiment_accession IN (
SELECT experiment_accession
from ena_experiment
where instrument_platform='ILLUMINA' AND experiment_accession IN (
SELECT distinct experiment_accession
from ena_topublication
)
)
order by first_year
) as first_year
group by first_year;
END
$func$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION cluster_counts(_sample_array int[])
RETURNS TABLE (
cluster_size bigint,
count bigint
) AS $func$
BEGIN
RETURN QUERY
SELECT r.cluster_size, count(r.cluster_size)
FROM (
SELECT count(cluster_id) AS cluster_size
FROM (
SELECT g.sample_id, g.cluster_id
FROM gene_features g
WHERE sample_id = ANY(_sample_array) AND "is_tRNA"=FALSE AND "is_rRNA"=FALSE
GROUP BY sample_id, cluster_id
) t
GROUP BY cluster_id
) r
GROUP BY r.cluster_size
ORDER by r.cluster_size ASC;
END
$func$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION assembly_stats_by_year(_scaffolds boolean default FALSE,
_plasmids boolean default FALSE)
RETURNS TABLE (
year numeric,
is_scaffolds boolean,
is_plasmids boolean,
total bigint,
total_contig numeric,
total_contig_length numeric,
min_contig_length numeric,
median_contig_length numeric,
mean_contig_length numeric,
max_contig_length numeric,
n50_contig_length numeric,
l50_contig_count numeric,
ng50_contig_length numeric,
lg50_contig_count numeric,
contigs_greater_1k numeric,
contigs_greater_10k numeric,
contigs_greater_100k numeric,
contigs_greater_1m numeric,
percent_contigs_greater_1k numeric,
percent_contigs_greater_10k numeric,
percent_contigs_greater_100k numeric,
percent_contigs_greater_1m numeric,
contig_percent_a numeric,
contig_percent_t numeric,
contig_percent_g numeric,
contig_percent_c numeric,
contig_percent_n numeric,
contig_non_acgtn numeric,
num_contig_non_acgtn numeric
) AS $func$
BEGIN
RETURN QUERY
SELECT EXTRACT(YEAR FROM m.first_public)::numeric AS year,
_scaffolds AS is_scaffolds,
_plasmids AS is_plasmids,
COUNT(EXTRACT(YEAR FROM m.first_public)) AS total,
ROUND(AVG(s.total_contig)::numeric,0) AS total_contig,
ROUND(AVG(s.total_contig_length)::numeric,0) AS total_contig_length,
ROUND(AVG(s.min_contig_length)::numeric,0) AS min_contig_length,
ROUND(AVG(s.median_contig_length)::numeric,0) AS median_contig_length,
ROUND(AVG(s.mean_contig_length)::numeric,0) AS mean_contig_length,
ROUND(AVG(s.max_contig_length)::numeric,0) AS max_contig_length,
ROUND(AVG(s.n50_contig_length)::numeric,0) AS n50_contig_length,
ROUND(AVG(s.l50_contig_count)::numeric,0) AS l50_contig_count,
ROUND(AVG(s.ng50_contig_length)::numeric,0) AS ng50_contig_length,
ROUND(AVG(s.lg50_contig_count)::numeric,0) AS lg50_contig_count,
ROUND(AVG(s.contigs_greater_1k)::numeric,0) AS contigs_greater_1k,
ROUND(AVG(s.contigs_greater_10k)::numeric,0) AS contigs_greater_10k,
ROUND(AVG(s.contigs_greater_100k)::numeric,0) AS contigs_greater_100k,
ROUND(AVG(s.contigs_greater_1m)::numeric,0) AS contigs_greater_1m,
ROUND(AVG(s.percent_contigs_greater_1k)::numeric,2) AS percent_contigs_greater_1k,
ROUND(AVG(s.percent_contigs_greater_10k)::numeric,2) AS percent_contigs_greater_10k,
ROUND(AVG(s.percent_contigs_greater_100k)::numeric,2) AS percent_contigs_greater_100k,
ROUND(AVG(s.percent_contigs_greater_1m)::numeric,2) AS percent_contigs_greater_1m,
ROUND(AVG(s.contig_percent_a)::numeric,2) AS contig_percent_a,
ROUND(AVG(s.contig_percent_t)::numeric,2) AS contig_percent_t,
ROUND(AVG(s.contig_percent_g)::numeric,2) AS contig_percent_g,
ROUND(AVG(s.contig_percent_c)::numeric,2) AS contig_percent_c,
ROUND(AVG(s.contig_percent_n)::numeric,2) AS contig_percent_n,
ROUND(AVG(s.contig_non_acgtn)::numeric,2) AS contig_non_acgtn,
ROUND(AVG(s.num_contig_non_acgtn)::numeric,0) AS num_contig_non_acgtn
FROM sample_metadata m
LEFT JOIN assembly_stats s
ON s.sample_id=m.sample_id
WHERE s.is_scaffolds=_scaffolds AND s.is_plasmids=_plasmids
GROUP BY year
ORDER BY year ASC;
END
$func$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION sequencing_stats_by_year(_original boolean default FALSE)
RETURNS TABLE (
year numeric,
is_original boolean,
total bigint,
total_bp numeric,
coverage numeric,
read_total numeric,
read_min numeric,
read_mean numeric,
read_std numeric,
read_median numeric,
read_max numeric,
read_25th numeric,
read_75th numeric,
qual_mean numeric,
qual_std numeric,
qual_median numeric,
qual_25th numeric,
qual_75th numeric
) AS $func$
BEGIN
RETURN QUERY
SELECT EXTRACT(YEAR FROM m.first_public)::numeric AS year,
_original AS is_original,
COUNT(EXTRACT(YEAR FROM m.first_public)) AS total,
ROUND(AVG(s.total_bp)::numeric,0) AS total_bp,
ROUND(AVG(s.coverage)::numeric,2) AS coverage,
ROUND(AVG(s.read_total)::numeric,0) AS read_total,
ROUND(AVG(s.read_min)::numeric,0) AS read_min,
ROUND(AVG(s.read_mean)::numeric,0) AS read_mean,
ROUND(AVG(s.read_std)::numeric,2) AS read_std,
ROUND(AVG(s.read_median)::numeric,0) AS read_median,
ROUND(AVG(s.read_max)::numeric,0) AS read_max,
ROUND(AVG(s.read_25th)::numeric,0) AS read_25th,
ROUND(AVG(s.read_75th)::numeric,0) AS read_75th,
ROUND(AVG(s.qual_mean)::numeric,2) AS qual_mean,
ROUND(AVG(s.qual_std)::numeric,2) AS qual_std,
ROUND(AVG(s.qual_median)::numeric,2) AS qual_median,
ROUND(AVG(s.qual_25th)::numeric,2) AS qual_25th,
ROUND(AVG(s.qual_75th)::numeric,2) AS qual_75th
FROM sample_metadata m
LEFT JOIN sequence_stat s
ON s.sample_id=m.sample_id
WHERE s.is_original=_original
GROUP BY year
ORDER BY year ASC;
END
$func$ LANGUAGE plpgsql;