Skip to content
This repository was archived by the owner on Aug 1, 2023. It is now read-only.

SQL Functions

Robert A. Petit III edited this page Mar 12, 2018 · 14 revisions

Functions

snp_counts

top_sequencing_centers

top_sequence_types

sccmec_status_by_st


snp_counts

Compare SNP counts of all samples against a subset of samples

Function Code

CREATE OR REPLACE FUNCTION snp_counts(_sample_array int[])
    RETURNS TABLE (
        snp_id   int,
        all_count   int,
        subset_count   bigint
    ) AS $func$
BEGIN

   RETURN QUERY
   SELECT c.snp_id, c.count, t.count_snp 
   FROM variant_snpcounts c 
   RIGHT JOIN (SELECT v.snp_id, count(v.snp_id) AS count_snp 
               FROM variant_tosnp v
               WHERE sample_id = ANY(_sample_array)
               GROUP BY v.snp_id) t
   ON t.snp_id=c.snp_id;

END
$func$  LANGUAGE plpgsql;

Example usage
select * from snp_counts(ARRAY(x,y,z,...));

Compare SNP counts of ST5 samples

SELECT * from snp_counts(
    (SELECT array(SELECT ("sample_id") 
                  FROM mlst_srst2 
                  WHERE st_stripped=5 and is_exact=True)
    )
);

top_sequencing_centers

Returns the sequencing center and count of samples.

Function Code
CREATE OR REPLACE FUNCTION top_sequencing_centers(_limit int default 10, _published boolean default FALSE)
    RETURNS TABLE (
        sequencing_center   text,
        count   bigint
    ) AS $func$
BEGIN
   IF _published = TRUE THEN
        RETURN QUERY
        SELECT cn.name, count(cn.name) as center_count
        FROM ena_centernames cn
        RIGHT JOIN (
            SELECT s.center_name
            FROM ena_experiment s
            WHERE s.experiment_accession IN (
                SELECT DISTINCT p.experiment_accession FROM ena_topublication p
            )
        ) sm
        ON sm.center_name=cn.ena_name
        GROUP BY cn.name
        ORDER BY center_count DESC
        LIMIT _limit;
   ELSE
        RETURN QUERY
        SELECT cn.name, count(cn.name) AS center_count
        FROM (
          SELECT metadata->>'center_name' AS name
          FROM sample_metadata
        ) cn
        GROUP BY cn.name
        ORDER BY center_count DESC
        LIMIT _limit;
   END IF;
END
$func$  LANGUAGE plpgsql;
Example Usage
select * from top_sequencing_centers();

Return top 10 sequencing centers

staphopia=> select * from top_sequencing_centers(10);
    sequencing_center     | count
--------------------------+-------
 SC                       | 17237
 BI                       |  5106
 UNIVERISTY OF OXFORD     |  1127
 Emory University         |   216
 TGEN-NORTH               |   115
 UNIMELB                  |    83
 TGEN                     |    79
 University of Melbourne  |    59
 UBC                      |    49
 University of Washington |    46
(10 rows)

Return the top sequencing center

staphopia=> select * from top_sequencing_centers(1);
 sequencing_center | count
-------------------+-------
 SC                | 17237

top_sequence_types

Function Code
CREATE OR REPLACE FUNCTION top_sequence_types(_limit int default 10, _published boolean default FALSE)
    RETURNS TABLE (
        sequence_type   int,
        count   bigint
    ) AS $func$
BEGIN
    IF _published = TRUE THEN
        RETURN QUERY
        SELECT st, count(st)
        FROM mlst_mlst m
        LEFT JOIN sample_sample s
        ON s.id=m.sample_id
        WHERE version_id=1 AND s.is_public=TRUE AND s.is_published=TRUE AND m.st > 0
        GROUP BY st
        ORDER BY count DESC
        LIMIT _limit;
    ELSE
        RETURN QUERY
        SELECT st, count(st)
        FROM mlst_mlst m
        LEFT JOIN sample_sample s
        ON s.id=m.sample_id
        WHERE version_id=1 AND s.is_public=TRUE AND m.st > 0
        GROUP BY st
        ORDER BY count DESC
        LIMIT _limit;
    END IF;

END
$func$  LANGUAGE plpgsql;

sccmec_status_by_st

Function Code
CREATE OR REPLACE FUNCTION sccmec_status_by_st(_sequence_type int)
    RETURNS TABLE (
        sample_tag   text,
        st   int,
        meca_coverage   decimal,
        is_exact   boolean
    ) AS $func$
BEGIN

    RETURN QUERY
    SELECT DISTINCT ON (sa.id) 
        sa.sample_tag, m.st_stripped as st, s.meca_total, m.is_exact
    FROM mlst_srst2 m
    LEFT JOIN sccmec_coverage s
    ON m.sample_id=s.sample_id
    LEFT JOIN sample_metadata sa
    ON m.sample_id=sa.id
    WHERE m.st_stripped=_sequence_type;

END
$func$  LANGUAGE plpgsql;

Illumina Total By Year

CREATE OR REPLACE FUNCTION illumina_by_year()
    RETURNS TABLE (
        year   int,
        total   bigint
    ) AS $func$
BEGIN
    RETURN QUERY
    select (first_year :: int), count(first_year) 
    from (
        select extract(year from date(first_public)) AS first_year
        from ena_run 
        where experiment_accession IN (
            SELECT experiment_accession 
            from ena_experiment 
            where instrument_platform='ILLUMINA'
        )
        order by first_year
    ) AS first_year 
    group by first_year;
END
$func$  LANGUAGE plpgsql;

Published Illumina By Year

CREATE OR REPLACE FUNCTION published_illumina_by_year()
    RETURNS TABLE (
        year   int,
        total   bigint
    ) AS $func$
BEGIN
    RETURN QUERY
    select (first_year :: int), count(first_year) 
    from (
        select extract(year from date(first_public)) as first_year
        from ena_run 
        where experiment_accession IN (
            SELECT experiment_accession 
            from ena_experiment 
            where instrument_platform='ILLUMINA' AND experiment_accession IN (
                SELECT distinct experiment_accession 
                from ena_topublication
            )
        )
        order by first_year
    ) as first_year 
    group by first_year;
END
$func$  LANGUAGE plpgsql;

Cluster Counts

CREATE OR REPLACE FUNCTION cluster_counts(_sample_array int[])
    RETURNS TABLE (
        cluster_size   bigint,
        count   bigint
    ) AS $func$
BEGIN

    RETURN QUERY
    SELECT r.cluster_size, count(r.cluster_size)
    FROM (
        SELECT count(cluster_id) AS cluster_size
        FROM (
            SELECT g.sample_id, g.cluster_id
            FROM gene_features g
            WHERE sample_id = ANY(_sample_array) AND "is_tRNA"=FALSE AND "is_rRNA"=FALSE
            GROUP BY sample_id, cluster_id
        ) t
        GROUP BY cluster_id
    ) r
    GROUP BY r.cluster_size
    ORDER by r.cluster_size ASC;

END
$func$  LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION assembly_stats_by_year(_scaffolds boolean default FALSE,
                                                   _plasmids boolean default FALSE)
    RETURNS TABLE (
        year   numeric,
        is_scaffolds   boolean,
        is_plasmids   boolean,
        total   bigint,
        total_contig   numeric,
        total_contig_length   numeric,
        min_contig_length   numeric,
        median_contig_length   numeric,
        mean_contig_length   numeric,
        max_contig_length   numeric,
        n50_contig_length   numeric,
        l50_contig_count   numeric,
        ng50_contig_length   numeric,
        lg50_contig_count   numeric,
        contigs_greater_1k   numeric,
        contigs_greater_10k   numeric,
        contigs_greater_100k   numeric,
        contigs_greater_1m   numeric,
        percent_contigs_greater_1k   numeric,
        percent_contigs_greater_10k   numeric,
        percent_contigs_greater_100k   numeric,
        percent_contigs_greater_1m   numeric,
        contig_percent_a   numeric,
        contig_percent_t   numeric,
        contig_percent_g   numeric,
        contig_percent_c   numeric,
        contig_percent_n   numeric,
        contig_non_acgtn   numeric,
        num_contig_non_acgtn   numeric
    ) AS $func$
BEGIN
    RETURN QUERY
    SELECT EXTRACT(YEAR FROM m.first_public)::numeric AS year,
        _scaffolds AS is_scaffolds,
        _plasmids AS is_plasmids,
        COUNT(EXTRACT(YEAR FROM m.first_public)) AS total,
        ROUND(AVG(s.total_contig)::numeric,0) AS total_contig,
        ROUND(AVG(s.total_contig_length)::numeric,0) AS total_contig_length,
        ROUND(AVG(s.min_contig_length)::numeric,0) AS min_contig_length,
        ROUND(AVG(s.median_contig_length)::numeric,0) AS median_contig_length,
        ROUND(AVG(s.mean_contig_length)::numeric,0) AS mean_contig_length,
        ROUND(AVG(s.max_contig_length)::numeric,0) AS max_contig_length,
        ROUND(AVG(s.n50_contig_length)::numeric,0) AS n50_contig_length,
        ROUND(AVG(s.l50_contig_count)::numeric,0) AS l50_contig_count,
        ROUND(AVG(s.ng50_contig_length)::numeric,0) AS ng50_contig_length,
        ROUND(AVG(s.lg50_contig_count)::numeric,0) AS lg50_contig_count,
        ROUND(AVG(s.contigs_greater_1k)::numeric,0) AS contigs_greater_1k,
        ROUND(AVG(s.contigs_greater_10k)::numeric,0) AS contigs_greater_10k,
        ROUND(AVG(s.contigs_greater_100k)::numeric,0) AS contigs_greater_100k,
        ROUND(AVG(s.contigs_greater_1m)::numeric,0) AS contigs_greater_1m,
        ROUND(AVG(s.percent_contigs_greater_1k)::numeric,2) AS percent_contigs_greater_1k,
        ROUND(AVG(s.percent_contigs_greater_10k)::numeric,2) AS percent_contigs_greater_10k,
        ROUND(AVG(s.percent_contigs_greater_100k)::numeric,2) AS percent_contigs_greater_100k,
        ROUND(AVG(s.percent_contigs_greater_1m)::numeric,2)  AS percent_contigs_greater_1m,
        ROUND(AVG(s.contig_percent_a)::numeric,2) AS contig_percent_a,
        ROUND(AVG(s.contig_percent_t)::numeric,2) AS contig_percent_t,
        ROUND(AVG(s.contig_percent_g)::numeric,2) AS contig_percent_g,
        ROUND(AVG(s.contig_percent_c)::numeric,2)  AS contig_percent_c,
        ROUND(AVG(s.contig_percent_n)::numeric,2) AS contig_percent_n,
        ROUND(AVG(s.contig_non_acgtn)::numeric,2) AS contig_non_acgtn,
        ROUND(AVG(s.num_contig_non_acgtn)::numeric,0) AS num_contig_non_acgtn
    FROM sample_metadata m
    LEFT JOIN assembly_stats s
    ON s.sample_id=m.sample_id
    WHERE s.is_scaffolds=_scaffolds AND s.is_plasmids=_plasmids
    GROUP BY year 
    ORDER BY year ASC;
END
$func$  LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION sequencing_stats_by_year(_original boolean default FALSE)
    RETURNS TABLE (
        year   numeric,
        is_original   boolean,
        total   bigint,
        total_bp   numeric,
        coverage   numeric,
        read_total   numeric,
        read_min   numeric,
        read_mean   numeric,
        read_std   numeric,
        read_median   numeric,
        read_max   numeric,
        read_25th   numeric,
        read_75th   numeric,
        qual_mean   numeric,
        qual_std   numeric,
        qual_median   numeric,
        qual_25th   numeric,
        qual_75th   numeric
    ) AS $func$
BEGIN
    RETURN QUERY
    SELECT EXTRACT(YEAR FROM m.first_public)::numeric AS year,
        _original AS is_original,
        COUNT(EXTRACT(YEAR FROM m.first_public)) AS total,
        ROUND(AVG(s.total_bp)::numeric,0) AS total_bp,
        ROUND(AVG(s.coverage)::numeric,2) AS coverage,
        ROUND(AVG(s.read_total)::numeric,0) AS read_total,
        ROUND(AVG(s.read_min)::numeric,0) AS read_min,
        ROUND(AVG(s.read_mean)::numeric,0) AS read_mean,
        ROUND(AVG(s.read_std)::numeric,2) AS read_std,
        ROUND(AVG(s.read_median)::numeric,0) AS read_median,
        ROUND(AVG(s.read_max)::numeric,0) AS read_max,
        ROUND(AVG(s.read_25th)::numeric,0) AS read_25th,
        ROUND(AVG(s.read_75th)::numeric,0) AS read_75th,
        ROUND(AVG(s.qual_mean)::numeric,2) AS qual_mean,
        ROUND(AVG(s.qual_std)::numeric,2) AS qual_std,
        ROUND(AVG(s.qual_median)::numeric,2) AS qual_median,
        ROUND(AVG(s.qual_25th)::numeric,2)  AS qual_25th,
        ROUND(AVG(s.qual_75th)::numeric,2) AS qual_75th
    FROM sample_metadata m
    LEFT JOIN sequence_stat s
    ON s.sample_id=m.sample_id
    WHERE s.is_original=_original
    GROUP BY year 
    ORDER BY year ASC;
END
$func$  LANGUAGE plpgsql;

Clone this wiki locally