From 3080c61cd17f722cdaac7295883dcbf29ef9161f Mon Sep 17 00:00:00 2001 From: dbujold Date: Fri, 19 Feb 2016 09:29:47 -0500 Subject: [PATCH 01/12] Fixed some wrappers assembly and taxon-id parameters. --- IHEC_json_converter/fetch_all_exp_jsons.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index e646b8f..dfde7d6 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -27,7 +27,7 @@ def main(argv): #Whole-Genome Bisulfite Sequencing experiments print("Processing WGB-Seq...") try: - data = bisulfite.bisulfite_wrapper(assembly='hg19', taxon_id=9606) + data = bisulfite.bisulfite_wrapper(assembly=assembly, taxon_id=taxon_id) filename = 'WGB-Seq_%s_%s_%s.json' % (taxon_id, assembly, date_str) output_file(data, filename) print("Done.") @@ -51,7 +51,7 @@ def main(argv): for t in targets: print("Processing ChIP-Seq %s..." % t) try: - data = chipseq.chip_seq_wrapper(assembly='hg19', taxon_id=9606, target=t) + data = chipseq.chip_seq_wrapper(assembly=assembly, taxon_id=taxon_id, target=t) filename = 'ChIP-Seq_%s_%s_%s_%s.json' % (taxon_id, assembly, t, date_str) output_file(data, filename) print("Done.") From 55c65f7706a346a5cc8bcb2fa62ce3d8b9ad1990 Mon Sep 17 00:00:00 2001 From: dbujold Date: Fri, 19 Feb 2016 09:50:35 -0500 Subject: [PATCH 02/12] Added original credits. --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 64c5b0d..c864026 100644 --- a/README.md +++ b/README.md @@ -13,3 +13,7 @@ These scripts fetch ENCODE metadata from the ENCODE portal, and output it in the cd IHEC_json_converter python ./fetch_all_exp_jsons.py --assembly=hg19 --taxon-id=9606 ``` + +### Credits + +Original development by ENCODE team, available here: https://github.com/kpaskov/PaskovEncodeScripts \ No newline at end of file From a1e0a467bb70fa6e582ccedecafd1fcdf940c5f2 Mon Sep 17 00:00:00 2001 From: dbujold Date: Tue, 19 Apr 2016 15:09:21 -0400 Subject: [PATCH 03/12] Slightly improved execution logging. --- .gitignore | 2 +- IHEC_json_converter/general.py | 28 ++++++++++++++++------------ output/README.md | 1 + 3 files changed, 18 insertions(+), 13 deletions(-) create mode 100644 output/README.md diff --git a/.gitignore b/.gitignore index f52b0de..3d94281 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ .idea #Scripts output directory content -output/* +output/*.json diff --git a/IHEC_json_converter/general.py b/IHEC_json_converter/general.py index be983f7..39044fe 100755 --- a/IHEC_json_converter/general.py +++ b/IHEC_json_converter/general.py @@ -1,5 +1,6 @@ -f__author__ = 'kelley' +from __future__ import print_function +f__author__ = 'kelley' import sys import urlparse @@ -73,13 +74,17 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci hub_description = create_hub_description(assembly, taxon_id) #Create all of the datasets (one for each experiment, sample_id pair) - print '%d experiments returned from this query.' % len(json_response) + print('%d experiments returned from this query.' % len(json_response)) datasets = dict() + datasetIdx = 0 for entry in json_response: + datasetIdx += 1 + print('%03d / %03d: \'%s\' ' % (datasetIdx, len(json_response), entry['accession']), end='') datasets.update(create_datasets(entry, assay_specific_additions)) + print() - print '%d IHEC datasets created.' % len(datasets) + print('%d IHEC datasets created.' % len(datasets)) # Merge datasets. We can merge two datasets if either # 1. they have the same experiment_id and their sample_attributes are identical @@ -96,7 +101,7 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci datasets = dict() for experiment_id, these_datasets in experiment_id_to_datasets.iteritems(): if is_match_sample_attributes([x['sample_attributes'] for x in these_datasets]): - print 'Match found: ' + experiment_id + print('Match found: ' + experiment_id) dataset = these_datasets[0] dataset['browser'] = merge_tracks(these_datasets) @@ -121,7 +126,7 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci datasets = dict(nogroup_datasets) for key, these_datasets in group_key_to_datasets.iteritems(): if len(these_datasets) > 1: - print 'Collapsed: ', key, len(these_datasets) + print('Collapsed: ', key, len(these_datasets)) dataset = these_datasets[0][1] dataset['browser'] = collapse_tracks([x[1] for x in these_datasets]) @@ -142,7 +147,7 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci del dataset['accession'] del dataset['sample_attributes']['replicate'] - print '%d IHEC datasets created after merge.' % len(datasets) + print('%d IHEC datasets created after merge.' % len(datasets)) return { 'hub_description': hub_description, @@ -200,7 +205,6 @@ def create_hub_description(assembly, taxon_id): def create_datasets(experiment, assay_specific_additions): - print experiment['accession'] experiment = get_json_from_encode('https://www.encodeproject.org/experiments/%s/?format=json' % experiment['accession']) #Create sample_attributes @@ -266,7 +270,7 @@ def set_main_track(tracks, track_hierarchy, exp_accession, track_type): #Set main track if main_track is None: - print 'Dataset %s has no %s tracks.' % (exp_accession, track_type) + print('Dataset %s has no %s tracks.' % (exp_accession, track_type)) else: main_track['is_main'] = True return tracks @@ -286,7 +290,7 @@ def create_track(file, protocol_document_href, replicate_num): "subtype": file['output_type'] } else: - print 'Output type not found: ' + file['output_type'] + print('Output type not found: ' + file['output_type']) def is_match_sample_attributes(sample_attributes): @@ -311,7 +315,7 @@ def create_sample_attribute(replicate): elif biosample['biosample_term_id'].startswith('NTR'): sample_ontology_uri = '' else: - print 'Could not find url for sample ontology %s' % biosample['biosample_term_id'] + print('Could not find url for sample ontology %s' % biosample['biosample_term_id']) sample_attribute = { 'sample_id' : biosample['accession'], @@ -357,7 +361,7 @@ def add_SA_cell_line(sample_attribute, biosample): if len(biosamples_associated_with_donor) > 0: sample_attribute['lineage'] = biosamples_associated_with_donor[0]['biosample_term_name'] else: - print 'No stem cell biosamples associated with donor %s' % donor_accession + print('No stem cell biosamples associated with donor %s' % donor_accession) else: sample_attribute['lineage'] = '' @@ -391,7 +395,7 @@ def prep_query_url(url, assembly, taxon_id, limit='all'): queries['replicates.library.biosample.donor.organism.taxon_id'] = [str(taxon_id)] query = '&'.join(['&'.join([key + '=' + value for value in values]) for key, values in queries.iteritems()]) new_url = urlparse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, query, parsed.fragment)) - print new_url + print(new_url) return new_url diff --git a/output/README.md b/output/README.md new file mode 100644 index 0000000..5d0bba6 --- /dev/null +++ b/output/README.md @@ -0,0 +1 @@ +Placeholder for scripts output. \ No newline at end of file From fcbf7c90edcb75cf938ca9a57f7fd4df1755136c Mon Sep 17 00:00:00 2001 From: dbujold Date: Tue, 19 Apr 2016 17:06:45 -0400 Subject: [PATCH 04/12] Modified output filenames. --- IHEC_json_converter/fetch_all_exp_jsons.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index dfde7d6..e4c7f4f 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -28,7 +28,7 @@ def main(argv): print("Processing WGB-Seq...") try: data = bisulfite.bisulfite_wrapper(assembly=assembly, taxon_id=taxon_id) - filename = 'WGB-Seq_%s_%s_%s.json' % (taxon_id, assembly, date_str) + filename = 'ENCODE.%s.%s.WGB-Seq.%s.json' % (taxon_id, assembly, date_str) output_file(data, filename) print("Done.") except Exception as e: @@ -39,11 +39,11 @@ def main(argv): print("Processing RNA-Seq...") try: data = rnaseq.rna_seq_wrapper(assembly=assembly, taxon_id=taxon_id) - filename = 'RNA-Seq_%s_%s_%s.json' % (taxon_id, assembly, date_str) + filename = 'ENCODE.%s.%s.RNA-Seq.%s.json' % (taxon_id, assembly, date_str) output_file(data, filename) print("Done.") except Exception as e: - print('An error occured while fetching RNA-Seq experiments: ' + e.message) + print('An error occured while fetching RNA-Seq experiments: ' + str(e.message)) print #ChIP-Seq experiments @@ -52,7 +52,7 @@ def main(argv): print("Processing ChIP-Seq %s..." % t) try: data = chipseq.chip_seq_wrapper(assembly=assembly, taxon_id=taxon_id, target=t) - filename = 'ChIP-Seq_%s_%s_%s_%s.json' % (taxon_id, assembly, t, date_str) + filename = 'ENCODE.%s.%s.ChIP-Seq_%s.%s.json' % (taxon_id, assembly, t, date_str) output_file(data, filename) print("Done.") except Exception as e: From 9f74ae6a850dd9d0e0d550998396ce73a92fe817 Mon Sep 17 00:00:00 2001 From: dbujold Date: Fri, 22 Apr 2016 09:13:01 -0400 Subject: [PATCH 05/12] Fixed differentiation_stage property typo, and tissue_type property source in ENCODE api. --- IHEC_json_converter/general.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/IHEC_json_converter/general.py b/IHEC_json_converter/general.py index 39044fe..87d6978 100755 --- a/IHEC_json_converter/general.py +++ b/IHEC_json_converter/general.py @@ -350,7 +350,7 @@ def add_SA_with_donor(sample_attribute, donor): def add_SA_cell_line(sample_attribute, biosample): if sample_attribute['biomaterial_type'] == 'Cell Line': sample_attribute['line'] = biosample['biosample_term_name'] - sample_attribute['differenciate_stage'] = biosample['biosample_term_name'] + sample_attribute['differentiation_stage'] = biosample['biosample_term_name'] sample_attribute['sex'] = biosample['sex'].title() #Get lineage @@ -373,7 +373,7 @@ def add_SA_primary_cell(sample_attribute, biosample): def add_SA_primary_tissue(sample_attribute, biosample): if sample_attribute['biomaterial_type'] == 'Primary Tissue': - sample_attribute['tissue_type'] = biosample['biosample_term_id'] + sample_attribute['tissue_type'] = biosample['biosample_term_name'] def create_experiment_attributes(experiment): From 19ff6e6f86f85bd71aafffe5b8ef655cfc9fa9fd Mon Sep 17 00:00:00 2001 From: dbujold Date: Wed, 31 Aug 2016 16:31:41 -0400 Subject: [PATCH 06/12] Fixed hg38 download instructions example. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c864026..1e5e903 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ These scripts fetch ENCODE metadata from the ENCODE portal, and output it in the #### Parameters -* assembly: Assembly name (e.g. hg19, hg38, mm10) -* taxon-id: Species taxonomy id (e.g. 9606 for human) +* assembly: Assembly name (e.g. hg19, GRCh38, mm10) +* taxon-id: Species taxonomy id (e.g. 9606 for human, 10090 for mouse) #### Example ``` From 68f31622d6d058f239fb5c4ed32b839980ea64e5 Mon Sep 17 00:00:00 2001 From: Ulugbek Baymuradov Date: Wed, 26 Oct 2016 11:20:14 -0700 Subject: [PATCH 07/12] ignore pyc files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3d94281..950912c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ #PyCharm workdir .idea - +*.pyc #Scripts output directory content output/*.json From 8fe401f89fd1682333e50348c4a78f5eb98ad321 Mon Sep 17 00:00:00 2001 From: Ulugbek Baymuradov Date: Thu, 27 Oct 2016 14:40:15 -0700 Subject: [PATCH 08/12] initial run --- IHEC_json_converter/bisulfite.py | 30 ++++----- IHEC_json_converter/chipseq.py | 30 ++++----- IHEC_json_converter/fetch_all_exp_jsons.py | 42 +++++++------ IHEC_json_converter/general.py | 73 +++++++++++++++------- IHEC_json_converter/rnaseq.py | 61 +++++++++--------- 5 files changed, 124 insertions(+), 112 deletions(-) diff --git a/IHEC_json_converter/bisulfite.py b/IHEC_json_converter/bisulfite.py index 6997f13..d58c8e4 100755 --- a/IHEC_json_converter/bisulfite.py +++ b/IHEC_json_converter/bisulfite.py @@ -1,29 +1,21 @@ __author__ = 'kelley' -import json -from general import convert_to_IHEC_format VERSION='1.6' -def bisulfite_wrapper(assembly, taxon_id): - url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name=whole-genome%20shotgun%20bisulfite%20sequencing' +# Used to set is_main +BISULFATE_TRACK_HIEARCHY = {'methylation_profile': ['methylation state at CpG', 'methylation state at CHH']} - # Used to set is_main - track_hierarchy = {'methylation_profile': ['methylation state at CpG', 'methylation state at CHH']} - def dataset_additions_f(experiment, json_object): +def bisulfate_addition(experiment, json_object): + #Set experiment_type + json_object['experiment_attributes']['experiment_type'] = 'DNA Methylation' + json_object['experiment_attributes']['assay_type'] = 'WGB-Seq' - #Set experiment_type - json_object['experiment_attributes']['experiment_type'] = 'DNA Methylation' - json_object['experiment_attributes']['assay_type'] = 'WGB-Seq' + return json_object - return json_object - return convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, dataset_additions_f) - - - -if __name__ == "__main__": - data = bisulfite_wrapper(assembly='hg19', taxon_id=9606) - with open('../output/bisulfite_v%s.json' % VERSION, 'w+') as outfile: - json.dump(data, outfile, indent=4) \ No newline at end of file +# if __name__ == "__main__": +# data = bisulfite_wrapper(assembly='hg19', taxon_id=9606) +# with open('../output/bisulfite_v%s.json' % VERSION, 'w+') as outfile: +# json.dump(data, outfile, indent=4) \ No newline at end of file diff --git a/IHEC_json_converter/chipseq.py b/IHEC_json_converter/chipseq.py index 65b9ce5..b624cd9 100755 --- a/IHEC_json_converter/chipseq.py +++ b/IHEC_json_converter/chipseq.py @@ -1,34 +1,26 @@ __author__ = 'kelley' -import json -from general import convert_to_IHEC_format VERSION='1.6' -def chip_seq_wrapper(assembly, taxon_id, target): - url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name=ChIP-seq&target.name=%s-human' % target - - # Used to set is_main - track_hierarchy = {'peak_calls': ['optimal idr thresholded peaks', 'conservative idr thresholded peaks', +CHIPSEQ_TRACK_HIEARCHY = {'peak_calls': ['optimal idr thresholded peaks', 'conservative idr thresholded peaks', 'replicated peaks', 'peaks', 'hotspots'], 'signal': ['signal p-value', 'fold change over control', 'signal', 'raw signal']} - def dataset_additions_f(experiment, json_object): - - #Set experiment_type - json_object['experiment_attributes']['experiment_type'] = experiment['target']['label'] - return json_object +def chip_seq_addition(experiment, json_object): - return convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, dataset_additions_f) + #Set experiment_type + json_object['experiment_attributes']['experiment_type'] = experiment['target']['label'] + return json_object -if __name__ == "__main__": - targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] - for t in targets: - data = chip_seq_wrapper(assembly='hg19', taxon_id=9606, target=t) - with open('../output/%s_v%s.json' % (t, VERSION), 'w+') as outfile: - json.dump(data, outfile, indent=4) +# if __name__ == "__main__": +# targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] +# for t in targets: +# data = chip_seq_wrapper(assembly='hg19', taxon_id=9606, target=t) +# with open('../output/%s_v%s.json' % (t, VERSION), 'w+') as outfile: +# json.dump(data, outfile, indent=4) diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index e4c7f4f..d6d5d94 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -25,29 +25,31 @@ def main(argv): #Todo: Merge experiments as a single JSON #Whole-Genome Bisulfite Sequencing experiments - print("Processing WGB-Seq...") - try: - data = bisulfite.bisulfite_wrapper(assembly=assembly, taxon_id=taxon_id) - filename = 'ENCODE.%s.%s.WGB-Seq.%s.json' % (taxon_id, assembly, date_str) - output_file(data, filename) - print("Done.") - except Exception as e: - print('An error occured while fetching WGB-Seq experiments: ' + e.message) - print + # print("Processing WGB-Seq...") + # try: + # data = bisulfite.bisulfite_wrapper(assembly=assembly, taxon_id=taxon_id) + # filename = 'ENCODE.%s.%s.WGB-Seq.%s.json' % (taxon_id, assembly, date_str) + # output_file(data, filename) + # print("Done.") + # except Exception as e: + # print('An error occured while fetching WGB-Seq experiments: ' + e.message) + # print - #RNA-Sequencing experiments - print("Processing RNA-Seq...") - try: - data = rnaseq.rna_seq_wrapper(assembly=assembly, taxon_id=taxon_id) - filename = 'ENCODE.%s.%s.RNA-Seq.%s.json' % (taxon_id, assembly, date_str) - output_file(data, filename) - print("Done.") - except Exception as e: - print('An error occured while fetching RNA-Seq experiments: ' + str(e.message)) - print + # #RNA-Sequencing experiments + # print("Processing RNA-Seq...") + # try: + # data = rnaseq.rna_seq_wrapper(assembly=assembly, taxon_id=taxon_id) + # filename = 'ENCODE.%s.%s.RNA-Seq.%s.json' % (taxon_id, assembly, date_str) + # output_file(data, filename) + # print("Done.") + # except Exception as e: + # print('An error occured while fetching RNA-Seq experiments: ' + str(e.message)) + # print #ChIP-Seq experiments - targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] + # targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] + targets = ['H3K27ac'] + for t in targets: print("Processing ChIP-Seq %s..." % t) try: diff --git a/IHEC_json_converter/general.py b/IHEC_json_converter/general.py index 87d6978..7255591 100755 --- a/IHEC_json_converter/general.py +++ b/IHEC_json_converter/general.py @@ -8,6 +8,10 @@ import json import os from datetime import datetime +from rnaseq import * +from bisulfite import * +from chipseq import * + ''' This file contains methods that convert ENCODE data (from its webservice urls) into JSON that IHEC then loads. @@ -60,31 +64,31 @@ # This code is used to generate IHEC json objects from any type of ENCODE assay. Anything assay-specific belongs # in other files. -def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_specific_additions, limit='all'): - #Prepare the url - make sure it has the right parameters. - url = prep_query_url(url, assembly, taxon_id, limit=limit) +def convert_to_IHEC_format(datasets, assembly, taxon_id): + # #Prepare the url - make sure it has the right parameters. + # url = prep_query_url(url, assembly, taxon_id, limit=limit) - #Get json response - json_response = get_json_from_encode(url) + # #Get json response + # json_response = get_json_from_encode(url) - if len(json_response) == 0: - raise Exception('This url returns no data.') + # if len(json_response) == 0: + # raise Exception('This url returns no data.') - #Create hub description - hub_description = create_hub_description(assembly, taxon_id) + # #Create hub description + # hub_description = create_hub_description(assembly, taxon_id) - #Create all of the datasets (one for each experiment, sample_id pair) - print('%d experiments returned from this query.' % len(json_response)) + # #Create all of the datasets (one for each experiment, sample_id pair) + # print('%d experiments returned from this query.' % len(json_response)) - datasets = dict() - datasetIdx = 0 - for entry in json_response: - datasetIdx += 1 - print('%03d / %03d: \'%s\' ' % (datasetIdx, len(json_response), entry['accession']), end='') - datasets.update(create_datasets(entry, assay_specific_additions)) - print() + # datasets = dict() + # datasetIdx = 0 + # for entry in json_response: + # datasetIdx += 1 + # print('%03d / %03d: \'%s\' ' % (datasetIdx, len(json_response), entry['accession']), end='') + # datasets.update(create_datasets(entry, assay_specific_additions)) + # print() - print('%d IHEC datasets created.' % len(datasets)) + # print('%d IHEC datasets created.' % len(datasets)) # Merge datasets. We can merge two datasets if either # 1. they have the same experiment_id and their sample_attributes are identical @@ -138,6 +142,7 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci for accession, dataset in datasets.iteritems(): for track_type in set(signal_mapping.values()): if track_type in dataset['browser']: + track_hierarchy = determine_track(dataset) set_main_track(dataset['browser'][track_type], track_hierarchy[track_type], accession, track_type) @@ -155,6 +160,32 @@ def convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, assay_speci } +def determine_addition(experiment_obj): + assay_term_name = experiment_obj['assay_term_name'] + if assay_term_name == 'ChIP-seq': + return chip_seq_addition + elif assay_term_name == 'RNA-seq': + return rna_seq_addition + elif assay_term_name == 'whole-genome shotgun bisulfite sequencing': + return bisulfite_addition + + +def determine_track(dataset): + assay_term_name = dataset['experiment_attributes']['assay_type'] + if assay_term_name == 'ChIP-seq': + return CHIPSEQ_TRACK_HIEARCHY + elif assay_term_name == 'RNA-seq': + return RNASEQ_TRACK_HIEARCHY + elif assay_term_name == 'whole-genome shotgun bisulfite sequencing': + return BISULFATE_TRACK_HIEARCHY + +def get_epirr_id(item): + for registry_id in item['dbxref']: + if 'IHEC' in registry_id: + return registry_id + return None + + def merge_tracks(datasets): # We want to merge these datasets together, so we need to merge tracks. new_tracks = {} @@ -204,8 +235,7 @@ def create_hub_description(assembly, taxon_id): } -def create_datasets(experiment, assay_specific_additions): - experiment = get_json_from_encode('https://www.encodeproject.org/experiments/%s/?format=json' % experiment['accession']) +def create_datasets(experiment, registry_id, assay_specific_additions): #Create sample_attributes sample_attributes = [create_sample_attribute(replicate) for replicate in experiment['replicates']] @@ -235,6 +265,7 @@ def create_datasets(experiment, assay_specific_additions): all_tracks[track_type] = these_tracks json_object = { + 'reference_registry_id': registry_id, 'sample_attributes': sample_attribute, 'experiment_attributes': experiment_attributes, 'browser': all_tracks, diff --git a/IHEC_json_converter/rnaseq.py b/IHEC_json_converter/rnaseq.py index fe7f964..788e817 100755 --- a/IHEC_json_converter/rnaseq.py +++ b/IHEC_json_converter/rnaseq.py @@ -5,48 +5,43 @@ VERSION='1.6' -def rna_seq_wrapper(assembly, taxon_id): - url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name=RNA-seq' - # Used to set is_main - track_hierarchy = {'signal_forward': ['plus strand signal of unique reads', 'plus strand signal of all reads', +RNASEQ_TRACK_HIEARCHY = {'signal_forward': ['plus strand signal of unique reads', 'plus strand signal of all reads', 'plus strand signal', 'raw plus strand signal'], 'signal_reverse': ['minus strand signal of unique reads', 'minus strand signal of all reads', 'minus strand signal', 'raw minus strand signal'], 'signal': ['signal of unique reads', 'signal of all reads', 'signal', 'raw signal', 'splice junctions'], 'contigs': ['contigs']} + +def rna_seq_addition(experiment, json_object): + size_range_to_experiment_type = { '>200': 'mRNA-seq', '<200': 'smRNA-seq' } - def dataset_additions_f(experiment, json_object): - - #Set experiment_type - size_range = None - if 'size_range' in experiment['replicates'][0]['library']: - size_range = experiment['replicates'][0]['library']['size_range'] - - if size_range is None: - print 'Could not find size_range ' + experiment['accession'] - json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' - json_object['experiment_attributes']['assay_type'] = 'RNA-seq' - elif size_range not in size_range_to_experiment_type: - print 'Size range not found: ' + experiment['replicates'][0]['library']['size_range'] - json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' - json_object['experiment_attributes']['assay_type'] = 'RNA-seq' - else: - json_object['experiment_attributes']['experiment_type'] = size_range_to_experiment_type[size_range] - json_object['experiment_attributes']['assay_type'] = size_range_to_experiment_type[size_range] - - return json_object - - return convert_to_IHEC_format(url, assembly, taxon_id, track_hierarchy, dataset_additions_f) - - - -if __name__ == "__main__": - data = rna_seq_wrapper(assembly='hg19', taxon_id=9606) - with open('../output/RNAseq_v%s.json' % VERSION, 'w+') as outfile: - json.dump(data, outfile, indent=4) \ No newline at end of file + #Set experiment_type + size_range = None + if 'size_range' in experiment['replicates'][0]['library']: + size_range = experiment['replicates'][0]['library']['size_range'] + + if size_range is None: + print 'Could not find size_range ' + experiment['accession'] + json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' + json_object['experiment_attributes']['assay_type'] = 'RNA-seq' + elif size_range not in size_range_to_experiment_type: + print 'Size range not found: ' + experiment['replicates'][0]['library']['size_range'] + json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' + json_object['experiment_attributes']['assay_type'] = 'RNA-seq' + else: + json_object['experiment_attributes']['experiment_type'] = size_range_to_experiment_type[size_range] + json_object['experiment_attributes']['assay_type'] = size_range_to_experiment_type[size_range] + + return json_object + + +# if __name__ == "__main__": +# data = rna_seq_wrapper(assembly='hg19', taxon_id=9606) +# with open('../output/RNAseq_v%s.json' % VERSION, 'w+') as outfile: +# json.dump(data, outfile, indent=4) \ No newline at end of file From 3ab60bd9704cfc73997d50fe51bb45811b34f060 Mon Sep 17 00:00:00 2001 From: Ulugbek Baymuradov Date: Thu, 27 Oct 2016 14:40:32 -0700 Subject: [PATCH 09/12] added new main file --- .../reference_epigenome_experiments.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 IHEC_json_converter/reference_epigenome_experiments.py diff --git a/IHEC_json_converter/reference_epigenome_experiments.py b/IHEC_json_converter/reference_epigenome_experiments.py new file mode 100644 index 0000000..5610cca --- /dev/null +++ b/IHEC_json_converter/reference_epigenome_experiments.py @@ -0,0 +1,36 @@ +import requests +import json +from general import * +from rnaseq import * +from bisulfite import * +from chipseq import * + + + +API_ENDPOINT = "https://www.encodeproject.org" +REFERENCE_EPIGENOME_COLLECTION = API_ENDPOINT + "/search/?type=ReferenceEpigenome&format=json&limit=all" + +def run(): + hg19_dataset = dict() + response = requests.get(API_ENDPOINT) + + # Iterate over Reference Epigenomes + for reference_epigenome in response.json()['@graph']: + reference_epigenome_object = requests.get(reference_epigenome['@id']+'?format=json').json() + reference_registry_id = get_epirr_id(reference_epigenome_object) + + # Iterate over experiments in Reference Epigenomes + for experiment_obj in reference_epigenome_object: + addition = determine_addition(experiment_obj) + if experiment_obj['assembly'][0] == 'hg19': + hg19_dataset.update(create_datasets(experiment_obj, reference_registry_id, addition)) + + with open('../output/hg19.json', 'w+') as outfile: + json.dumps(hg19_dataset, outfile, indent=4) + + + + + +if __name__ == "__main__": + run() \ No newline at end of file From c358899d1597017e571b90f992f5b4e1ab845c12 Mon Sep 17 00:00:00 2001 From: Ulugbek Baymuradov Date: Thu, 27 Oct 2016 16:50:54 -0700 Subject: [PATCH 10/12] debug and clean up --- IHEC_json_converter/bisulfite.py | 2 +- IHEC_json_converter/fetch_all_exp_jsons.py | 39 ------------ IHEC_json_converter/general.py | 62 ++++++++----------- .../reference_epigenome_experiments.py | 37 +++++------ IHEC_json_converter/rnaseq.py | 5 +- 5 files changed, 45 insertions(+), 100 deletions(-) diff --git a/IHEC_json_converter/bisulfite.py b/IHEC_json_converter/bisulfite.py index d58c8e4..ee16540 100755 --- a/IHEC_json_converter/bisulfite.py +++ b/IHEC_json_converter/bisulfite.py @@ -4,7 +4,7 @@ VERSION='1.6' # Used to set is_main -BISULFATE_TRACK_HIEARCHY = {'methylation_profile': ['methylation state at CpG', 'methylation state at CHH']} +BISULFATE_TRACK_HIEARCHY = {'methylation_profile': ['methylation state at CpG', 'signal']} def bisulfate_addition(experiment, json_object): diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index d6d5d94..ffe0676 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -2,7 +2,6 @@ import getopt import json from datetime import datetime -import rnaseq, bisulfite, chipseq def main(argv): opts, args = getopt.getopt(argv, "", ["assembly=", "taxon-id="]) @@ -22,45 +21,7 @@ def main(argv): date_str = datetime.now().date() - #Todo: Merge experiments as a single JSON - #Whole-Genome Bisulfite Sequencing experiments - # print("Processing WGB-Seq...") - # try: - # data = bisulfite.bisulfite_wrapper(assembly=assembly, taxon_id=taxon_id) - # filename = 'ENCODE.%s.%s.WGB-Seq.%s.json' % (taxon_id, assembly, date_str) - # output_file(data, filename) - # print("Done.") - # except Exception as e: - # print('An error occured while fetching WGB-Seq experiments: ' + e.message) - # print - - # #RNA-Sequencing experiments - # print("Processing RNA-Seq...") - # try: - # data = rnaseq.rna_seq_wrapper(assembly=assembly, taxon_id=taxon_id) - # filename = 'ENCODE.%s.%s.RNA-Seq.%s.json' % (taxon_id, assembly, date_str) - # output_file(data, filename) - # print("Done.") - # except Exception as e: - # print('An error occured while fetching RNA-Seq experiments: ' + str(e.message)) - # print - - #ChIP-Seq experiments - # targets = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3'] - targets = ['H3K27ac'] - - for t in targets: - print("Processing ChIP-Seq %s..." % t) - try: - data = chipseq.chip_seq_wrapper(assembly=assembly, taxon_id=taxon_id, target=t) - filename = 'ENCODE.%s.%s.ChIP-Seq_%s.%s.json' % (taxon_id, assembly, t, date_str) - output_file(data, filename) - print("Done.") - except Exception as e: - print('An error occured while fetching ChIP-Seq %s experiments: ' % t + e.message) - print - print("Operation completed.") diff --git a/IHEC_json_converter/general.py b/IHEC_json_converter/general.py index 7255591..5ce618b 100755 --- a/IHEC_json_converter/general.py +++ b/IHEC_json_converter/general.py @@ -3,7 +3,7 @@ f__author__ = 'kelley' import sys -import urlparse +from urllib.parse import urlparse import requests import json import os @@ -65,30 +65,10 @@ # in other files. def convert_to_IHEC_format(datasets, assembly, taxon_id): - # #Prepare the url - make sure it has the right parameters. - # url = prep_query_url(url, assembly, taxon_id, limit=limit) + + #Create hub description + hub_description = create_hub_description(assembly, taxon_id) - # #Get json response - # json_response = get_json_from_encode(url) - - # if len(json_response) == 0: - # raise Exception('This url returns no data.') - - # #Create hub description - # hub_description = create_hub_description(assembly, taxon_id) - - # #Create all of the datasets (one for each experiment, sample_id pair) - # print('%d experiments returned from this query.' % len(json_response)) - - # datasets = dict() - # datasetIdx = 0 - # for entry in json_response: - # datasetIdx += 1 - # print('%03d / %03d: \'%s\' ' % (datasetIdx, len(json_response), entry['accession']), end='') - # datasets.update(create_datasets(entry, assay_specific_additions)) - # print() - - # print('%d IHEC datasets created.' % len(datasets)) # Merge datasets. We can merge two datasets if either # 1. they have the same experiment_id and their sample_attributes are identical @@ -103,7 +83,7 @@ def convert_to_IHEC_format(datasets, assembly, taxon_id): experiment_id_to_datasets[experiment_id].append(dataset) datasets = dict() - for experiment_id, these_datasets in experiment_id_to_datasets.iteritems(): + for experiment_id, these_datasets in experiment_id_to_datasets.items(): if is_match_sample_attributes([x['sample_attributes'] for x in these_datasets]): print('Match found: ' + experiment_id) @@ -118,7 +98,7 @@ def convert_to_IHEC_format(datasets, assembly, taxon_id): # Merge 2 group_key_to_datasets = dict() nogroup_datasets = dict() - for accession, dataset in datasets.iteritems(): + for accession, dataset in datasets.items(): if dataset['award'] == 'ENCODE3': key = (dataset['experiment_attributes']['experiment_type'], dataset['sample_attributes']['sample_id']) if key not in group_key_to_datasets: @@ -128,7 +108,7 @@ def convert_to_IHEC_format(datasets, assembly, taxon_id): nogroup_datasets[accession] = dataset datasets = dict(nogroup_datasets) - for key, these_datasets in group_key_to_datasets.iteritems(): + for key, these_datasets in group_key_to_datasets.items(): if len(these_datasets) > 1: print('Collapsed: ', key, len(these_datasets)) @@ -139,24 +119,32 @@ def convert_to_IHEC_format(datasets, assembly, taxon_id): datasets[these_datasets[0][0]] = these_datasets[0][1] #Set is_main on tracks - for accession, dataset in datasets.iteritems(): + for accession, dataset in datasets.items(): for track_type in set(signal_mapping.values()): if track_type in dataset['browser']: track_hierarchy = determine_track(dataset) - set_main_track(dataset['browser'][track_type], track_hierarchy[track_type], accession, track_type) + if track_hierarchy: + set_main_track(dataset['browser'][track_type], track_hierarchy[track_type], accession, track_type) # Remove extra entries: - for accession, dataset in datasets.iteritems(): + + samples = dict() + + for accession, dataset in datasets.items(): del dataset['award'] del dataset['accession'] del dataset['sample_attributes']['replicate'] + dataset['sample_id'] = dataset['sample_attributes']['sample_id'] + samples[dataset['sample_id']] = dataset['sample_attributes'] + del dataset['sample_attributes'] print('%d IHEC datasets created after merge.' % len(datasets)) return { 'hub_description': hub_description, - 'datasets': datasets + 'datasets': datasets, + 'samples': samples } @@ -180,7 +168,7 @@ def determine_track(dataset): return BISULFATE_TRACK_HIEARCHY def get_epirr_id(item): - for registry_id in item['dbxref']: + for registry_id in item['dbxrefs']: if 'IHEC' in registry_id: return registry_id return None @@ -190,7 +178,7 @@ def merge_tracks(datasets): # We want to merge these datasets together, so we need to merge tracks. new_tracks = {} for dataset in datasets: - for track_type, tracks in dataset['browser'].iteritems(): + for track_type, tracks in dataset['browser'].items(): if track_type not in new_tracks: new_tracks[track_type] = [] @@ -200,14 +188,14 @@ def merge_tracks(datasets): #If a track has been assigned to multiple replicates, we might end up seeing it twice - check for that cleaned_tracks = dict() - for track_type, tracks in new_tracks.iteritems(): + for track_type, tracks in new_tracks.items(): url_to_track = dict() for track in tracks: if track['big_data_url'] not in url_to_track: url_to_track[track['big_data_url']] = [] url_to_track[track['big_data_url']].append(track) - for url, url_tracks in url_to_track.iteritems(): + for url, url_tracks in url_to_track.items(): if len(url_tracks) > 1: track = url_tracks[0] track['subtype'] = track['subtype'][0:track['subtype'].index('(rep')-1] @@ -219,7 +207,7 @@ def collapse_tracks(datasets): # We want to collapse these datasets together, so we need to collapse tracks. new_tracks = {} for dataset in datasets: - for track_type, tracks in dataset['browser'].iteritems(): + for track_type, tracks in dataset['browser'].items(): if track_type not in new_tracks: new_tracks[track_type] = [] new_tracks[track_type].extend(tracks) @@ -424,7 +412,7 @@ def prep_query_url(url, assembly, taxon_id, limit='all'): queries['format'] = ['json'] queries['assembly'] = [assembly] queries['replicates.library.biosample.donor.organism.taxon_id'] = [str(taxon_id)] - query = '&'.join(['&'.join([key + '=' + value for value in values]) for key, values in queries.iteritems()]) + query = '&'.join(['&'.join([key + '=' + value for value in values]) for key, values in queries.items()]) new_url = urlparse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, query, parsed.fragment)) print(new_url) return new_url diff --git a/IHEC_json_converter/reference_epigenome_experiments.py b/IHEC_json_converter/reference_epigenome_experiments.py index 5610cca..26aa5b8 100644 --- a/IHEC_json_converter/reference_epigenome_experiments.py +++ b/IHEC_json_converter/reference_epigenome_experiments.py @@ -8,29 +8,26 @@ API_ENDPOINT = "https://www.encodeproject.org" -REFERENCE_EPIGENOME_COLLECTION = API_ENDPOINT + "/search/?type=ReferenceEpigenome&format=json&limit=all" +REFERENCE_EPIGENOME_COLLECTION = API_ENDPOINT + "/search/?type=ReferenceEpigenome&format=json" -def run(): - hg19_dataset = dict() - response = requests.get(API_ENDPOINT) +def collect_experiments(assembly, taxon_id): + dataset = dict() + count = 0 + response = requests.get(REFERENCE_EPIGENOME_COLLECTION) # Iterate over Reference Epigenomes for reference_epigenome in response.json()['@graph']: - reference_epigenome_object = requests.get(reference_epigenome['@id']+'?format=json').json() + reference_epigenome_object = requests.get(API_ENDPOINT + reference_epigenome['@id']+'?format=json').json() reference_registry_id = get_epirr_id(reference_epigenome_object) - - # Iterate over experiments in Reference Epigenomes - for experiment_obj in reference_epigenome_object: - addition = determine_addition(experiment_obj) - if experiment_obj['assembly'][0] == 'hg19': - hg19_dataset.update(create_datasets(experiment_obj, reference_registry_id, addition)) - - with open('../output/hg19.json', 'w+') as outfile: - json.dumps(hg19_dataset, outfile, indent=4) - - - - -if __name__ == "__main__": - run() \ No newline at end of file + # Iterate over experiments in Reference Epigenomes + for experiment_obj in reference_epigenome_object['related_datasets']: + if 'assembly' in experiment_obj and experiment_obj['assembly'] and experiment_obj['assembly'][0] == assembly: + addition = determine_addition(experiment_obj) + if addition: + dataset.update(create_datasets(experiment_obj, reference_registry_id, addition)) + count = count + 1 + print(count) + + data = convert_to_IHEC_format(dataset, assembly, taxon_id) + return data \ No newline at end of file diff --git a/IHEC_json_converter/rnaseq.py b/IHEC_json_converter/rnaseq.py index 788e817..45f4acc 100755 --- a/IHEC_json_converter/rnaseq.py +++ b/IHEC_json_converter/rnaseq.py @@ -1,7 +1,6 @@ __author__ = 'kelley' import json -from general import convert_to_IHEC_format, set_main_track, signal_mapping VERSION='1.6' @@ -27,11 +26,11 @@ def rna_seq_addition(experiment, json_object): size_range = experiment['replicates'][0]['library']['size_range'] if size_range is None: - print 'Could not find size_range ' + experiment['accession'] + print('Could not find size_range ' + experiment['accession']) json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' json_object['experiment_attributes']['assay_type'] = 'RNA-seq' elif size_range not in size_range_to_experiment_type: - print 'Size range not found: ' + experiment['replicates'][0]['library']['size_range'] + print('Size range not found: ' + experiment['replicates'][0]['library']['size_range']) json_object['experiment_attributes']['experiment_type'] = 'RNA-seq' json_object['experiment_attributes']['assay_type'] = 'RNA-seq' else: From 8196433cfbea27ae86e5b0c8272e4a6cf5b03d05 Mon Sep 17 00:00:00 2001 From: Ulugbek Baymuradov Date: Fri, 28 Oct 2016 13:50:32 -0700 Subject: [PATCH 11/12] working version --- IHEC_json_converter/fetch_all_exp_jsons.py | 5 ++++- IHEC_json_converter/general.py | 18 +++++++++++++++--- .../reference_epigenome_experiments.py | 9 +++------ 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index ffe0676..59a6ed4 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -2,6 +2,7 @@ import getopt import json from datetime import datetime +from reference_epigenome_experiments import collect_experiments def main(argv): opts, args = getopt.getopt(argv, "", ["assembly=", "taxon-id="]) @@ -21,7 +22,9 @@ def main(argv): date_str = datetime.now().date() - + filename = 'ENCODE.{}.{}.{}.json'.format(taxon_id, assembly, date_str) + data = collect_experiments() + output_file(data, filename) diff --git a/IHEC_json_converter/general.py b/IHEC_json_converter/general.py index 5ce618b..66e1b56 100755 --- a/IHEC_json_converter/general.py +++ b/IHEC_json_converter/general.py @@ -90,6 +90,7 @@ def convert_to_IHEC_format(datasets, assembly, taxon_id): dataset = these_datasets[0] dataset['browser'] = merge_tracks(these_datasets) dataset['sample_attributes']['sample_id'] = '-'.join(sorted([x['sample_attributes']['sample_id'] for x in these_datasets])) + dataset['analysis_attributes'] = create_analysis_attributes(dataset) datasets[experiment_id] = dataset else: for dataset in these_datasets: @@ -229,7 +230,7 @@ def create_datasets(experiment, registry_id, assay_specific_additions): sample_attributes = [create_sample_attribute(replicate) for replicate in experiment['replicates']] #Create experiment_attributes - experiment_attributes = create_experiment_attributes(experiment) + experiment_attributes = create_experiment_attributes(experiment, registry_id) #Create tracks datasets = dict() @@ -253,7 +254,6 @@ def create_datasets(experiment, registry_id, assay_specific_additions): all_tracks[track_type] = these_tracks json_object = { - 'reference_registry_id': registry_id, 'sample_attributes': sample_attribute, 'experiment_attributes': experiment_attributes, 'browser': all_tracks, @@ -395,12 +395,24 @@ def add_SA_primary_tissue(sample_attribute, biosample): sample_attribute['tissue_type'] = biosample['biosample_term_name'] -def create_experiment_attributes(experiment): +def create_experiment_attributes(experiment, registry_id): return { + "reference_registry_id": registry_id, "assay_type": experiment['assay_term_name'], "experiment_ontology_uri": 'http://purl.obolibrary.org/obo/%s' % experiment['assay_term_id'] } + +def create_analysis_attributes(experiment): + # No mapping exists between the expected attributes and portal data for now + analysis_attributes = { + "analysis_group": "", + "alignment_software": "", + "alignment_software_version": "", + "analysis_software": "...", + "analysis_software_version": "" + } + return analysis_attributes ############################# Get ENCODE data ############################# # These functions get ENCODE data from the webservices. diff --git a/IHEC_json_converter/reference_epigenome_experiments.py b/IHEC_json_converter/reference_epigenome_experiments.py index 26aa5b8..2d9e173 100644 --- a/IHEC_json_converter/reference_epigenome_experiments.py +++ b/IHEC_json_converter/reference_epigenome_experiments.py @@ -8,26 +8,23 @@ API_ENDPOINT = "https://www.encodeproject.org" -REFERENCE_EPIGENOME_COLLECTION = API_ENDPOINT + "/search/?type=ReferenceEpigenome&format=json" +REFERENCE_EPIGENOME_COLLECTION = API_ENDPOINT + "/search/?type=ReferenceEpigenome&format=json&limit=all" -def collect_experiments(assembly, taxon_id): +def collect_experiments(assembly='hg19', taxon_id=9606): dataset = dict() - count = 0 response = requests.get(REFERENCE_EPIGENOME_COLLECTION) # Iterate over Reference Epigenomes for reference_epigenome in response.json()['@graph']: reference_epigenome_object = requests.get(API_ENDPOINT + reference_epigenome['@id']+'?format=json').json() reference_registry_id = get_epirr_id(reference_epigenome_object) - + # Iterate over experiments in Reference Epigenomes for experiment_obj in reference_epigenome_object['related_datasets']: if 'assembly' in experiment_obj and experiment_obj['assembly'] and experiment_obj['assembly'][0] == assembly: addition = determine_addition(experiment_obj) if addition: dataset.update(create_datasets(experiment_obj, reference_registry_id, addition)) - count = count + 1 - print(count) data = convert_to_IHEC_format(dataset, assembly, taxon_id) return data \ No newline at end of file From 3bda2ce825f741838b08c955a4fa1667037461e2 Mon Sep 17 00:00:00 2001 From: Ulugbek Baymuradov Date: Fri, 28 Oct 2016 14:25:34 -0700 Subject: [PATCH 12/12] run with python3 --- IHEC_json_converter/fetch_all_exp_jsons.py | 2 +- IHEC_json_converter/reference_epigenome_experiments.py | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/IHEC_json_converter/fetch_all_exp_jsons.py b/IHEC_json_converter/fetch_all_exp_jsons.py index 59a6ed4..e2c08fc 100755 --- a/IHEC_json_converter/fetch_all_exp_jsons.py +++ b/IHEC_json_converter/fetch_all_exp_jsons.py @@ -23,7 +23,7 @@ def main(argv): date_str = datetime.now().date() filename = 'ENCODE.{}.{}.{}.json'.format(taxon_id, assembly, date_str) - data = collect_experiments() + data = collect_experiments(assembly, taxon_id) output_file(data, filename) diff --git a/IHEC_json_converter/reference_epigenome_experiments.py b/IHEC_json_converter/reference_epigenome_experiments.py index 2d9e173..8e94359 100644 --- a/IHEC_json_converter/reference_epigenome_experiments.py +++ b/IHEC_json_converter/reference_epigenome_experiments.py @@ -21,7 +21,7 @@ def collect_experiments(assembly='hg19', taxon_id=9606): # Iterate over experiments in Reference Epigenomes for experiment_obj in reference_epigenome_object['related_datasets']: - if 'assembly' in experiment_obj and experiment_obj['assembly'] and experiment_obj['assembly'][0] == assembly: + if 'assembly' in experiment_obj and experiment_obj['assembly'] and assembly in experiment_obj['assembly']: addition = determine_addition(experiment_obj) if addition: dataset.update(create_datasets(experiment_obj, reference_registry_id, addition)) diff --git a/README.md b/README.md index 1e5e903..b5bc220 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ These scripts fetch ENCODE metadata from the ENCODE portal, and output it in the #### Example ``` cd IHEC_json_converter -python ./fetch_all_exp_jsons.py --assembly=hg19 --taxon-id=9606 +python3 ./fetch_all_exp_jsons.py --assembly=hg19 --taxon-id=9606 ``` ### Credits