SQL Functions

Functions

snp_counts

Compare SNP counts of all samples against a subset of samples

Function Code


CREATE OR REPLACE FUNCTION snp_counts(_sample_array int[])
    RETURNS TABLE (
        snp_id   int,
        all_count   int,
        subset_count   bigint
    ) AS $func$
BEGIN

   RETURN QUERY
   SELECT c.snp_id, c.count, t.count_snp 
   FROM variant_snpcounts c 
   RIGHT JOIN (SELECT v.snp_id, count(v.snp_id) AS count_snp 
               FROM variant_tosnp v
               WHERE sample_id = ANY(_sample_array)
               GROUP BY v.snp_id) t
   ON t.snp_id=c.snp_id;

END
$func$  LANGUAGE plpgsql;

Example usage

select * from snp_counts(ARRAY(x,y,z,...));

Compare SNP counts of ST5 samples

SELECT * from snp_counts(
    (SELECT array(SELECT ("sample_id") 
                  FROM mlst_srst2 
                  WHERE st_stripped=5 and is_exact=True)
    )
);

top_sequencing_centers

Returns the sequencing center and count of samples.

Function Code

CREATE OR REPLACE FUNCTION top_sequencing_centers(_limit int default 10, _published boolean default FALSE)
    RETURNS TABLE (
        sequencing_center   text,
        count   bigint
    ) AS $func$
BEGIN
   IF _published = TRUE THEN
        RETURN QUERY
        SELECT cn.name, count(cn.name) as center_count
        FROM ena_centernames cn
        RIGHT JOIN (
            SELECT s.center_name
            FROM ena_experiment s
            WHERE s.experiment_accession IN (
                SELECT DISTINCT p.experiment_accession FROM ena_topublication p
            )
        ) sm
        ON sm.center_name=cn.ena_name
        GROUP BY cn.name
        ORDER BY center_count DESC
        LIMIT _limit;
   ELSE
        RETURN QUERY
        SELECT cn.name, count(cn.name) AS center_count
        FROM (
          SELECT metadata->>'center_name' AS name
          FROM sample_metadata
        ) cn
        GROUP BY cn.name
        ORDER BY center_count DESC
        LIMIT _limit;
   END IF;
END
$func$  LANGUAGE plpgsql;

Example Usage

select * from top_sequencing_centers();

Return top 10 sequencing centers

staphopia=> select * from top_sequencing_centers(10);
    sequencing_center     | count
--------------------------+-------
 SC                       | 17237
 BI                       |  5106
 UNIVERISTY OF OXFORD     |  1127
 Emory University         |   216
 TGEN-NORTH               |   115
 UNIMELB                  |    83
 TGEN                     |    79
 University of Melbourne  |    59
 UBC                      |    49
 University of Washington |    46
(10 rows)

Return the top sequencing center

staphopia=> select * from top_sequencing_centers(1);
 sequencing_center | count
-------------------+-------
 SC                | 17237

top_sequence_types

Function Code

CREATE OR REPLACE FUNCTION top_sequence_types(_limit int default 10, _published boolean default FALSE)
    RETURNS TABLE (
        sequence_type   int,
        count   bigint
    ) AS $func$
BEGIN
    IF _published = TRUE THEN
        RETURN QUERY
        SELECT st, count(st)
        FROM mlst_mlst m
        LEFT JOIN sample_sample s
        ON s.id=m.sample_id
        WHERE version_id=1 AND s.is_public=TRUE AND s.is_published=TRUE AND m.st > 0
        GROUP BY st
        ORDER BY count DESC
        LIMIT _limit;
    ELSE
        RETURN QUERY
        SELECT st, count(st)
        FROM mlst_mlst m
        LEFT JOIN sample_sample s
        ON s.id=m.sample_id
        WHERE version_id=1 AND s.is_public=TRUE AND m.st > 0
        GROUP BY st
        ORDER BY count DESC
        LIMIT _limit;
    END IF;

END
$func$  LANGUAGE plpgsql;

sccmec_status_by_st

Function Code

CREATE OR REPLACE FUNCTION sccmec_status_by_st(_sequence_type int)
    RETURNS TABLE (
        sample_tag   text,
        st   int,
        meca_coverage   decimal,
        is_exact   boolean
    ) AS $func$
BEGIN

    RETURN QUERY
    SELECT DISTINCT ON (sa.id) 
        sa.sample_tag, m.st_stripped as st, s.meca_total, m.is_exact
    FROM mlst_srst2 m
    LEFT JOIN sccmec_coverage s
    ON m.sample_id=s.sample_id
    LEFT JOIN sample_metadata sa
    ON m.sample_id=sa.id
    WHERE m.st_stripped=_sequence_type;

END
$func$  LANGUAGE plpgsql;

Illumina Total By Year

CREATE OR REPLACE FUNCTION illumina_by_year()
    RETURNS TABLE (
        year   int,
        total   bigint
    ) AS $func$
BEGIN
    RETURN QUERY
    select (first_year :: int), count(first_year) 
    from (
        select extract(year from date(first_public)) AS first_year
        from ena_run 
        where experiment_accession IN (
            SELECT experiment_accession 
            from ena_experiment 
            where instrument_platform='ILLUMINA'
        )
        order by first_year
    ) AS first_year 
    group by first_year;
END
$func$  LANGUAGE plpgsql;

Published Illumina By Year

CREATE OR REPLACE FUNCTION published_illumina_by_year()
    RETURNS TABLE (
        year   int,
        total   bigint
    ) AS $func$
BEGIN
    RETURN QUERY
    select (first_year :: int), count(first_year) 
    from (
        select extract(year from date(first_public)) as first_year
        from ena_run 
        where experiment_accession IN (
            SELECT experiment_accession 
            from ena_experiment 
            where instrument_platform='ILLUMINA' AND experiment_accession IN (
                SELECT distinct experiment_accession 
                from ena_topublication
            )
        )
        order by first_year
    ) as first_year 
    group by first_year;
END
$func$  LANGUAGE plpgsql;

Cluster Counts

CREATE OR REPLACE FUNCTION cluster_counts(_sample_array int[])
    RETURNS TABLE (
        cluster_size   bigint,
        count   bigint
    ) AS $func$
BEGIN

    RETURN QUERY
    SELECT r.cluster_size, count(r.cluster_size)
    FROM (
        SELECT count(cluster_id) AS cluster_size
        FROM (
            SELECT g.sample_id, g.cluster_id
            FROM gene_features g
            WHERE sample_id = ANY(_sample_array) AND "is_tRNA"=FALSE AND "is_rRNA"=FALSE
            GROUP BY sample_id, cluster_id
        ) t
        GROUP BY cluster_id
    ) r
    GROUP BY r.cluster_size
    ORDER by r.cluster_size ASC;

END
$func$  LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION assembly_stats_by_year(_scaffolds boolean default FALSE,
                                                   _plasmids boolean default FALSE)
    RETURNS TABLE (
        year   numeric,
        is_scaffolds   boolean,
        is_plasmids   boolean,
        total   bigint,
        total_contig   numeric,
        total_contig_length   numeric,
        min_contig_length   numeric,
        median_contig_length   numeric,
        mean_contig_length   numeric,
        max_contig_length   numeric,
        n50_contig_length   numeric,
        l50_contig_count   numeric,
        ng50_contig_length   numeric,
        lg50_contig_count   numeric,
        contigs_greater_1k   numeric,
        contigs_greater_10k   numeric,
        contigs_greater_100k   numeric,
        contigs_greater_1m   numeric,
        percent_contigs_greater_1k   numeric,
        percent_contigs_greater_10k   numeric,
        percent_contigs_greater_100k   numeric,
        percent_contigs_greater_1m   numeric,
        contig_percent_a   numeric,
        contig_percent_t   numeric,
        contig_percent_g   numeric,
        contig_percent_c   numeric,
        contig_percent_n   numeric,
        contig_non_acgtn   numeric,
        num_contig_non_acgtn   numeric
    ) AS $func$
BEGIN
    RETURN QUERY
    SELECT EXTRACT(YEAR FROM m.first_public)::numeric AS year,
        _scaffolds AS is_scaffolds,
        _plasmids AS is_plasmids,
        COUNT(EXTRACT(YEAR FROM m.first_public)) AS total,
        ROUND(AVG(s.total_contig)::numeric,0) AS total_contig,
        ROUND(AVG(s.total_contig_length)::numeric,0) AS total_contig_length,
        ROUND(AVG(s.min_contig_length)::numeric,0) AS min_contig_length,
        ROUND(AVG(s.median_contig_length)::numeric,0) AS median_contig_length,
        ROUND(AVG(s.mean_contig_length)::numeric,0) AS mean_contig_length,
        ROUND(AVG(s.max_contig_length)::numeric,0) AS max_contig_length,
        ROUND(AVG(s.n50_contig_length)::numeric,0) AS n50_contig_length,
        ROUND(AVG(s.l50_contig_count)::numeric,0) AS l50_contig_count,
        ROUND(AVG(s.ng50_contig_length)::numeric,0) AS ng50_contig_length,
        ROUND(AVG(s.lg50_contig_count)::numeric,0) AS lg50_contig_count,
        ROUND(AVG(s.contigs_greater_1k)::numeric,0) AS contigs_greater_1k,
        ROUND(AVG(s.contigs_greater_10k)::numeric,0) AS contigs_greater_10k,
        ROUND(AVG(s.contigs_greater_100k)::numeric,0) AS contigs_greater_100k,
        ROUND(AVG(s.contigs_greater_1m)::numeric,0) AS contigs_greater_1m,
        ROUND(AVG(s.percent_contigs_greater_1k)::numeric,2) AS percent_contigs_greater_1k,
        ROUND(AVG(s.percent_contigs_greater_10k)::numeric,2) AS percent_contigs_greater_10k,
        ROUND(AVG(s.percent_contigs_greater_100k)::numeric,2) AS percent_contigs_greater_100k,
        ROUND(AVG(s.percent_contigs_greater_1m)::numeric,2)  AS percent_contigs_greater_1m,
        ROUND(AVG(s.contig_percent_a)::numeric,2) AS contig_percent_a,
        ROUND(AVG(s.contig_percent_t)::numeric,2) AS contig_percent_t,
        ROUND(AVG(s.contig_percent_g)::numeric,2) AS contig_percent_g,
        ROUND(AVG(s.contig_percent_c)::numeric,2)  AS contig_percent_c,
        ROUND(AVG(s.contig_percent_n)::numeric,2) AS contig_percent_n,
        ROUND(AVG(s.contig_non_acgtn)::numeric,2) AS contig_non_acgtn,
        ROUND(AVG(s.num_contig_non_acgtn)::numeric,0) AS num_contig_non_acgtn
    FROM sample_metadata m
    LEFT JOIN assembly_stats s
    ON s.sample_id=m.sample_id
    WHERE s.is_scaffolds=_scaffolds AND s.is_plasmids=_plasmids
    GROUP BY year 
    ORDER BY year ASC;
END
$func$  LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION sequencing_stats_by_year(_original boolean default FALSE)
    RETURNS TABLE (
        year   numeric,
        is_original   boolean,
        total   bigint,
        total_bp   numeric,
        coverage   numeric,
        read_total   numeric,
        read_min   numeric,
        read_mean   numeric,
        read_std   numeric,
        read_median   numeric,
        read_max   numeric,
        read_25th   numeric,
        read_75th   numeric,
        qual_mean   numeric,
        qual_std   numeric,
        qual_median   numeric,
        qual_25th   numeric,
        qual_75th   numeric
    ) AS $func$
BEGIN
    RETURN QUERY
    SELECT EXTRACT(YEAR FROM m.first_public)::numeric AS year,
        _original AS is_original,
        COUNT(EXTRACT(YEAR FROM m.first_public)) AS total,
        ROUND(AVG(s.total_bp)::numeric,0) AS total_bp,
        ROUND(AVG(s.coverage)::numeric,2) AS coverage,
        ROUND(AVG(s.read_total)::numeric,0) AS read_total,
        ROUND(AVG(s.read_min)::numeric,0) AS read_min,
        ROUND(AVG(s.read_mean)::numeric,0) AS read_mean,
        ROUND(AVG(s.read_std)::numeric,2) AS read_std,
        ROUND(AVG(s.read_median)::numeric,0) AS read_median,
        ROUND(AVG(s.read_max)::numeric,0) AS read_max,
        ROUND(AVG(s.read_25th)::numeric,0) AS read_25th,
        ROUND(AVG(s.read_75th)::numeric,0) AS read_75th,
        ROUND(AVG(s.qual_mean)::numeric,2) AS qual_mean,
        ROUND(AVG(s.qual_std)::numeric,2) AS qual_std,
        ROUND(AVG(s.qual_median)::numeric,2) AS qual_median,
        ROUND(AVG(s.qual_25th)::numeric,2)  AS qual_25th,
        ROUND(AVG(s.qual_75th)::numeric,2) AS qual_75th
    FROM sample_metadata m
    LEFT JOIN sequence_stat s
    ON s.sample_id=m.sample_id
    WHERE s.is_original=_original
    GROUP BY year 
    ORDER BY year ASC;
END
$func$  LANGUAGE plpgsql;

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

SQL Functions

Functions

snp_counts

Function Code

Example usage

top_sequencing_centers

Function Code

Example Usage

top_sequence_types

Function Code

sccmec_status_by_st

Function Code

Illumina Total By Year

Published Illumina By Year

Cluster Counts

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally