diff --git a/.gitignore b/.gitignore index f52b0de..950912c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ #PyCharm workdir .idea - +*.pyc #Scripts output directory content -output/* +output/*.json diff --git a/IHEC_json_converter/bisulfite.py b/IHEC_json_converter/bisulfite.py index 6997f13..ee16540 100755 --- a/IHEC_json_converter/bisulfite.py +++ b/IHEC_json_converter/bisulfite.py @@ -1,29 +1,21 @@ __author__ = 'kelley' -import json -from general import convert_to_IHEC_format VERSION='1.6' -def bisulfite_wrapper(assembly, taxon_id): - url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name=whole-genome%20shotgun%20bisulfite%20sequencing' +# Used to set is_main +BISULFATE_TRACK_HIEARCHY = {'methylation_profile': ['methylation state at CpG', 'signal']} - # Used to set is_main - track_hierarchy = {'methylation_profile': ['methylation state at CpG', 'methylation state at CHH']} - def dataset_additions_f(experiment, json_object): +def bisulfate_addition(experiment, json_object): + #Set experiment_type + json_object['experiment_attributes']['experiment_type'] = 'DNA Methylation' + json_object['experiment_attributes']['assay_type'] = 'WGB-Seq' - #Set experiment_type - json_object['experiment_attributes']['experiment_type'] = 'DNA Methylation' - json_object['experiment_attributes']['assay_type'] = 'WGB-Seq' + return json_object - return json_object - return convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, dataset_additions_f) - - - -if __name__ == "__main__": - data = bisulfite_wrapper(assembly='hg19', taxon_id=9606) - with open('../output/bisulfite_v%s.json' % VERSION, 'w+') as outfile: - json.dump(data, outfile, indent=4) \ No newline at end of file +# if __name__ == "__main__": +# data = bisulfite_wrapper(assembly='hg19', taxon_id=9606) +# with open('../output/bisulfite_v%s.json' % VERSION, 'w+') as outfile: +# json.dump(data, outfile, indent=4) \ No newline at end of file diff --git a/IHEC_json_converter/chipseq.py b/IHEC_json_converter/chipseq.py index 65b9ce5..b624cd9 100755 --- a/IHEC_json_converter/chipseq.py +++ b/IHEC_json_converter/chipseq.py @@ -1,34 +1,26 @@ __author__ = 'kelley' -import json -from general import convert_to_IHEC_format VERSION='1.6' -def chip_seq_wrapper(assembly, taxon_id, target): - url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name=ChIP-seq&target.name=%s-human' % target - - # Used to set is_main - track_hierarchy = {'peak_calls': ['optimal idr thresholded peaks', 'conservative idr thresholded peaks', +CHIPSEQ_TRACK_HIEARCHY = {'peak_calls': ['optimal idr thresholded peaks', 'conservative idr thresholded peaks', 'replicated peaks', 'peaks', 'hotspots'], 'signal': ['signal p-value', 'fold change over control', 'signal', 'raw signal']} - def dataset_additions_f(experiment, json_object): - - #Set experiment_type - json_object['experiment_attributes']['experiment_type'] = experiment['target']['label'] - return json_object +def chip_seq_addition(experiment, json_object): - return convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, dataset_additions_f) + #Set experiment_type + json_object['experiment_attributes']['experiment_type'] = experiment['target']['label'] + return json_object -if __name__ == "__main__": - targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] - for t in targets: - data = chip_seq_wrapper(assembly='hg19', taxon_id=9606, target=t) - with open('../output/%s_v%s.json' % (t, VERSION), 'w+') as outfile: - json.dump(data, outfile, indent=4) +# if __name__ == "__main__": +# targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] +# for t in targets: +# data = chip_seq_wrapper(assembly='hg19', taxon_id=9606, target=t) +# with open('../output/%s_v%s.json' % (t, VERSION), 'w+') as outfile: +# json.dump(data, outfile, indent=4) diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index e646b8f..e2c08fc 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -2,7 +2,7 @@ import getopt import json from datetime import datetime -import rnaseq, bisulfite, chipseq +from reference_epigenome_experiments import collect_experiments def main(argv): opts, args = getopt.getopt(argv, "", ["assembly=", "taxon-id="]) @@ -22,43 +22,9 @@ def main(argv): date_str = datetime.now().date() - #Todo: Merge experiments as a single JSON - - #Whole-Genome Bisulfite Sequencing experiments - print("Processing WGB-Seq...") - try: - data = bisulfite.bisulfite_wrapper(assembly='hg19', taxon_id=9606) - filename = 'WGB-Seq_%s_%s_%s.json' % (taxon_id, assembly, date_str) - output_file(data, filename) - print("Done.") - except Exception as e: - print('An error occured while fetching WGB-Seq experiments: ' + e.message) - print - - #RNA-Sequencing experiments - print("Processing RNA-Seq...") - try: - data = rnaseq.rna_seq_wrapper(assembly=assembly, taxon_id=taxon_id) - filename = 'RNA-Seq_%s_%s_%s.json' % (taxon_id, assembly, date_str) - output_file(data, filename) - print("Done.") - except Exception as e: - print('An error occured while fetching RNA-Seq experiments: ' + e.message) - print - - #ChIP-Seq experiments - targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] - for t in targets: - print("Processing ChIP-Seq %s..." % t) - try: - data = chipseq.chip_seq_wrapper(assembly='hg19', taxon_id=9606, target=t) - filename = 'ChIP-Seq_%s_%s_%s_%s.json' % (taxon_id, assembly, t, date_str) - output_file(data, filename) - print("Done.") - except Exception as e: - print('An error occured while fetching ChIP-Seq %s experiments: ' % t + e.message) - print - print("Operation completed.") + filename = 'ENCODE.{}.{}.{}.json'.format(taxon_id, assembly, date_str) + data = collect_experiments(assembly, taxon_id) + output_file(data, filename) diff --git a/IHEC_json_converter/general.py b/IHEC_json_converter/general.py index be983f7..66e1b56 100755 --- a/IHEC_json_converter/general.py +++ b/IHEC_json_converter/general.py @@ -1,12 +1,17 @@ -f__author__ = 'kelley' +from __future__ import print_function +f__author__ = 'kelley' import sys -import urlparse +from urllib.parse import urlparse import requests import json import os from datetime import datetime +from rnaseq import * +from bisulfite import * +from chipseq import * + ''' This file contains methods that convert ENCODE data (from its webservice urls) into JSON that IHEC then loads. @@ -59,27 +64,11 @@ # This code is used to generate IHEC json objects from any type of ENCODE assay. Anything assay-specific belongs # in other files. -def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_specific_additions, limit='all'): - #Prepare the url - make sure it has the right parameters. - url = prep_query_url(url, assembly, taxon_id, limit=limit) - - #Get json response - json_response = get_json_from_encode(url) - - if len(json_response) == 0: - raise Exception('This url returns no data.') - +def convert_to_IHEC_format(datasets, assembly, taxon_id): + #Create hub description hub_description = create_hub_description(assembly, taxon_id) - #Create all of the datasets (one for each experiment, sample_id pair) - print '%d experiments returned from this query.' % len(json_response) - - datasets = dict() - for entry in json_response: - datasets.update(create_datasets(entry, assay_specific_additions)) - - print '%d IHEC datasets created.' % len(datasets) # Merge datasets. We can merge two datasets if either # 1. they have the same experiment_id and their sample_attributes are identical @@ -94,13 +83,14 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci experiment_id_to_datasets[experiment_id].append(dataset) datasets = dict() - for experiment_id, these_datasets in experiment_id_to_datasets.iteritems(): + for experiment_id, these_datasets in experiment_id_to_datasets.items(): if is_match_sample_attributes([x['sample_attributes'] for x in these_datasets]): - print 'Match found: ' + experiment_id + print('Match found: ' + experiment_id) dataset = these_datasets[0] dataset['browser'] = merge_tracks(these_datasets) dataset['sample_attributes']['sample_id'] = '-'.join(sorted([x['sample_attributes']['sample_id'] for x in these_datasets])) + dataset['analysis_attributes'] = create_analysis_attributes(dataset) datasets[experiment_id] = dataset else: for dataset in these_datasets: @@ -109,7 +99,7 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci # Merge 2 group_key_to_datasets = dict() nogroup_datasets = dict() - for accession, dataset in datasets.iteritems(): + for accession, dataset in datasets.items(): if dataset['award'] == 'ENCODE3': key = (dataset['experiment_attributes']['experiment_type'], dataset['sample_attributes']['sample_id']) if key not in group_key_to_datasets: @@ -119,9 +109,9 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci nogroup_datasets[accession] = dataset datasets = dict(nogroup_datasets) - for key, these_datasets in group_key_to_datasets.iteritems(): + for key, these_datasets in group_key_to_datasets.items(): if len(these_datasets) > 1: - print 'Collapsed: ', key, len(these_datasets) + print('Collapsed: ', key, len(these_datasets)) dataset = these_datasets[0][1] dataset['browser'] = collapse_tracks([x[1] for x in these_datasets]) @@ -130,31 +120,66 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci datasets[these_datasets[0][0]] = these_datasets[0][1] #Set is_main on tracks - for accession, dataset in datasets.iteritems(): + for accession, dataset in datasets.items(): for track_type in set(signal_mapping.values()): if track_type in dataset['browser']: - set_main_track(dataset['browser'][track_type], track_hierarchy[track_type], accession, track_type) + track_hierarchy = determine_track(dataset) + if track_hierarchy: + set_main_track(dataset['browser'][track_type], track_hierarchy[track_type], accession, track_type) # Remove extra entries: - for accession, dataset in datasets.iteritems(): + + samples = dict() + + for accession, dataset in datasets.items(): del dataset['award'] del dataset['accession'] del dataset['sample_attributes']['replicate'] + dataset['sample_id'] = dataset['sample_attributes']['sample_id'] + samples[dataset['sample_id']] = dataset['sample_attributes'] + del dataset['sample_attributes'] - print '%d IHEC datasets created after merge.' % len(datasets) + print('%d IHEC datasets created after merge.' % len(datasets)) return { 'hub_description': hub_description, - 'datasets': datasets + 'datasets': datasets, + 'samples': samples } +def determine_addition(experiment_obj): + assay_term_name = experiment_obj['assay_term_name'] + if assay_term_name == 'ChIP-seq': + return chip_seq_addition + elif assay_term_name == 'RNA-seq': + return rna_seq_addition + elif assay_term_name == 'whole-genome shotgun bisulfite sequencing': + return bisulfite_addition + + +def determine_track(dataset): + assay_term_name = dataset['experiment_attributes']['assay_type'] + if assay_term_name == 'ChIP-seq': + return CHIPSEQ_TRACK_HIEARCHY + elif assay_term_name == 'RNA-seq': + return RNASEQ_TRACK_HIEARCHY + elif assay_term_name == 'whole-genome shotgun bisulfite sequencing': + return BISULFATE_TRACK_HIEARCHY + +def get_epirr_id(item): + for registry_id in item['dbxrefs']: + if 'IHEC' in registry_id: + return registry_id + return None + + def merge_tracks(datasets): # We want to merge these datasets together, so we need to merge tracks. new_tracks = {} for dataset in datasets: - for track_type, tracks in dataset['browser'].iteritems(): + for track_type, tracks in dataset['browser'].items(): if track_type not in new_tracks: new_tracks[track_type] = [] @@ -164,14 +189,14 @@ def merge_tracks(datasets): #If a track has been assigned to multiple replicates, we might end up seeing it twice - check for that cleaned_tracks = dict() - for track_type, tracks in new_tracks.iteritems(): + for track_type, tracks in new_tracks.items(): url_to_track = dict() for track in tracks: if track['big_data_url'] not in url_to_track: url_to_track[track['big_data_url']] = [] url_to_track[track['big_data_url']].append(track) - for url, url_tracks in url_to_track.iteritems(): + for url, url_tracks in url_to_track.items(): if len(url_tracks) > 1: track = url_tracks[0] track['subtype'] = track['subtype'][0:track['subtype'].index('(rep')-1] @@ -183,7 +208,7 @@ def collapse_tracks(datasets): # We want to collapse these datasets together, so we need to collapse tracks. new_tracks = {} for dataset in datasets: - for track_type, tracks in dataset['browser'].iteritems(): + for track_type, tracks in dataset['browser'].items(): if track_type not in new_tracks: new_tracks[track_type] = [] new_tracks[track_type].extend(tracks) @@ -199,15 +224,13 @@ def create_hub_description(assembly, taxon_id): } -def create_datasets(experiment, assay_specific_additions): - print experiment['accession'] - experiment = get_json_from_encode('https://www.encodeproject.org/experiments/%s/?format=json' % experiment['accession']) +def create_datasets(experiment, registry_id, assay_specific_additions): #Create sample_attributes sample_attributes = [create_sample_attribute(replicate) for replicate in experiment['replicates']] #Create experiment_attributes - experiment_attributes = create_experiment_attributes(experiment) + experiment_attributes = create_experiment_attributes(experiment, registry_id) #Create tracks datasets = dict() @@ -266,7 +289,7 @@ def set_main_track(tracks, track_hierarchy, exp_accession, track_type): #Set main track if main_track is None: - print 'Dataset %s has no %s tracks.' % (exp_accession, track_type) + print('Dataset %s has no %s tracks.' % (exp_accession, track_type)) else: main_track['is_main'] = True return tracks @@ -286,7 +309,7 @@ def create_track(file, protocol_document_href, replicate_num): "subtype": file['output_type'] } else: - print 'Output type not found: ' + file['output_type'] + print('Output type not found: ' + file['output_type']) def is_match_sample_attributes(sample_attributes): @@ -311,7 +334,7 @@ def create_sample_attribute(replicate): elif biosample['biosample_term_id'].startswith('NTR'): sample_ontology_uri = '' else: - print 'Could not find url for sample ontology %s' % biosample['biosample_term_id'] + print('Could not find url for sample ontology %s' % biosample['biosample_term_id']) sample_attribute = { 'sample_id' : biosample['accession'], @@ -346,7 +369,7 @@ def add_SA_with_donor(sample_attribute, donor): def add_SA_cell_line(sample_attribute, biosample): if sample_attribute['biomaterial_type'] == 'Cell Line': sample_attribute['line'] = biosample['biosample_term_name'] - sample_attribute['differenciate_stage'] = biosample['biosample_term_name'] + sample_attribute['differentiation_stage'] = biosample['biosample_term_name'] sample_attribute['sex'] = biosample['sex'].title() #Get lineage @@ -357,7 +380,7 @@ def add_SA_cell_line(sample_attribute, biosample): if len(biosamples_associated_with_donor) > 0: sample_attribute['lineage'] = biosamples_associated_with_donor[0]['biosample_term_name'] else: - print 'No stem cell biosamples associated with donor %s' % donor_accession + print('No stem cell biosamples associated with donor %s' % donor_accession) else: sample_attribute['lineage'] = '' @@ -369,15 +392,27 @@ def add_SA_primary_cell(sample_attribute, biosample): def add_SA_primary_tissue(sample_attribute, biosample): if sample_attribute['biomaterial_type'] == 'Primary Tissue': - sample_attribute['tissue_type'] = biosample['biosample_term_id'] + sample_attribute['tissue_type'] = biosample['biosample_term_name'] -def create_experiment_attributes(experiment): +def create_experiment_attributes(experiment, registry_id): return { + "reference_registry_id": registry_id, "assay_type": experiment['assay_term_name'], "experiment_ontology_uri": 'http://purl.obolibrary.org/obo/%s' % experiment['assay_term_id'] } + +def create_analysis_attributes(experiment): + # No mapping exists between the expected attributes and portal data for now + analysis_attributes = { + "analysis_group": "", + "alignment_software": "", + "alignment_software_version": "", + "analysis_software": "...", + "analysis_software_version": "" + } + return analysis_attributes ############################# Get ENCODE data ############################# # These functions get ENCODE data from the webservices. @@ -389,9 +424,9 @@ def prep_query_url(url, assembly, taxon_id, limit='all'): queries['format'] = ['json'] queries['assembly'] = [assembly] queries['replicates.library.biosample.donor.organism.taxon_id'] = [str(taxon_id)] - query = '&'.join(['&'.join([key + '=' + value for value in values]) for key, values in queries.iteritems()]) + query = '&'.join(['&'.join([key + '=' + value for value in values]) for key, values in queries.items()]) new_url = urlparse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, query, parsed.fragment)) - print new_url + print(new_url) return new_url diff --git a/IHEC_json_converter/reference_epigenome_experiments.py b/IHEC_json_converter/reference_epigenome_experiments.py new file mode 100644 index 0000000..8e94359 --- /dev/null +++ b/IHEC_json_converter/reference_epigenome_experiments.py @@ -0,0 +1,30 @@ +import requests +import json +from general import * +from rnaseq import * +from bisulfite import * +from chipseq import * + + + +API_ENDPOINT = "https://www.encodeproject.org" +REFERENCE_EPIGENOME_COLLECTION = API_ENDPOINT + "/search/?type=ReferenceEpigenome&format=json&limit=all" + +def collect_experiments(assembly='hg19', taxon_id=9606): + dataset = dict() + response = requests.get(REFERENCE_EPIGENOME_COLLECTION) + + # Iterate over Reference Epigenomes + for reference_epigenome in response.json()['@graph']: + reference_epigenome_object = requests.get(API_ENDPOINT + reference_epigenome['@id']+'?format=json').json() + reference_registry_id = get_epirr_id(reference_epigenome_object) + + # Iterate over experiments in Reference Epigenomes + for experiment_obj in reference_epigenome_object['related_datasets']: + if 'assembly' in experiment_obj and experiment_obj['assembly'] and assembly in experiment_obj['assembly']: + addition = determine_addition(experiment_obj) + if addition: + dataset.update(create_datasets(experiment_obj, reference_registry_id, addition)) + + data = convert_to_IHEC_format(dataset, assembly, taxon_id) + return data \ No newline at end of file diff --git a/IHEC_json_converter/rnaseq.py b/IHEC_json_converter/rnaseq.py index fe7f964..45f4acc 100755 --- a/IHEC_json_converter/rnaseq.py +++ b/IHEC_json_converter/rnaseq.py @@ -1,52 +1,46 @@ __author__ = 'kelley' import json -from general import convert_to_IHEC_format, set_main_track, signal_mapping VERSION='1.6' -def rna_seq_wrapper(assembly, taxon_id): - url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name=RNA-seq' - # Used to set is_main - track_hierarchy = {'signal_forward': ['plus strand signal of unique reads', 'plus strand signal of all reads', +RNASEQ_TRACK_HIEARCHY = {'signal_forward': ['plus strand signal of unique reads', 'plus strand signal of all reads', 'plus strand signal', 'raw plus strand signal'], 'signal_reverse': ['minus strand signal of unique reads', 'minus strand signal of all reads', 'minus strand signal', 'raw minus strand signal'], 'signal': ['signal of unique reads', 'signal of all reads', 'signal', 'raw signal', 'splice junctions'], 'contigs': ['contigs']} + +def rna_seq_addition(experiment, json_object): + size_range_to_experiment_type = { '>200': 'mRNA-seq', '<200': 'smRNA-seq' } - def dataset_additions_f(experiment, json_object): - - #Set experiment_type - size_range = None - if 'size_range' in experiment['replicates'][0]['library']: - size_range = experiment['replicates'][0]['library']['size_range'] - - if size_range is None: - print 'Could not find size_range ' + experiment['accession'] - json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' - json_object['experiment_attributes']['assay_type'] = 'RNA-seq' - elif size_range not in size_range_to_experiment_type: - print 'Size range not found: ' + experiment['replicates'][0]['library']['size_range'] - json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' - json_object['experiment_attributes']['assay_type'] = 'RNA-seq' - else: - json_object['experiment_attributes']['experiment_type'] = size_range_to_experiment_type[size_range] - json_object['experiment_attributes']['assay_type'] = size_range_to_experiment_type[size_range] - - return json_object - - return convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, dataset_additions_f) - - - -if __name__ == "__main__": - data = rna_seq_wrapper(assembly='hg19', taxon_id=9606) - with open('../output/RNAseq_v%s.json' % VERSION, 'w+') as outfile: - json.dump(data, outfile, indent=4) \ No newline at end of file + #Set experiment_type + size_range = None + if 'size_range' in experiment['replicates'][0]['library']: + size_range = experiment['replicates'][0]['library']['size_range'] + + if size_range is None: + print('Could not find size_range ' + experiment['accession']) + json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' + json_object['experiment_attributes']['assay_type'] = 'RNA-seq' + elif size_range not in size_range_to_experiment_type: + print('Size range not found: ' + experiment['replicates'][0]['library']['size_range']) + json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' + json_object['experiment_attributes']['assay_type'] = 'RNA-seq' + else: + json_object['experiment_attributes']['experiment_type'] = size_range_to_experiment_type[size_range] + json_object['experiment_attributes']['assay_type'] = size_range_to_experiment_type[size_range] + + return json_object + + +# if __name__ == "__main__": +# data = rna_seq_wrapper(assembly='hg19', taxon_id=9606) +# with open('../output/RNAseq_v%s.json' % VERSION, 'w+') as outfile: +# json.dump(data, outfile, indent=4) \ No newline at end of file diff --git a/README.md b/README.md index 64c5b0d..b5bc220 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,15 @@ These scripts fetch ENCODE metadata from the ENCODE portal, and output it in the #### Parameters -* assembly: Assembly name (e.g. hg19, hg38, mm10) -* taxon-id: Species taxonomy id (e.g. 9606 for human) +* assembly: Assembly name (e.g. hg19, GRCh38, mm10) +* taxon-id: Species taxonomy id (e.g. 9606 for human, 10090 for mouse) #### Example ``` cd IHEC_json_converter -python ./fetch_all_exp_jsons.py --assembly=hg19 --taxon-id=9606 +python3 ./fetch_all_exp_jsons.py --assembly=hg19 --taxon-id=9606 ``` + +### Credits + +Original development by ENCODE team, available here: https://github.com/kpaskov/PaskovEncodeScripts \ No newline at end of file diff --git a/output/README.md b/output/README.md new file mode 100644 index 0000000..5d0bba6 --- /dev/null +++ b/output/README.md @@ -0,0 +1 @@ +Placeholder for scripts output. \ No newline at end of file