import json
import os
import re
from random import randint
import sys
import warnings
import xml
import xml.etree.ElementTree as ET
from Bio import BiopythonWarning
from Bio import SeqIO
from Bio._py3k import Request as _Request
from Bio._py3k import urlopen as _urlopen
from Bio._py3k import urlencode as _urlencode
from Bio._py3k import _as_string
from . import utils
class Response(object):
    """Accepts and parses results from a call to the BOLD API.

    The raw payload is handed to :meth:`_parse_data`, which dispatches to the
    JSON, XML or FASTA parser depending on the service alias and stores the
    outcome on the instance.

    Attributes:
        items (list or str): Metadata from BOLD after parsing. A list of
            dictionaries (JSON/XML services), a list of Biopython SeqRecord
            objects (``call_sequence_data``) or the raw response string when
            an XML service returned something that is not XML (e.g. TSV).
        service (str): Alias of the method used to interact with BOLD.
        method (str): Same value as ``service``; kept for backward
            compatibility with code that reads ``method``.
        file_contents (bytes): Raw payload, only set for ``call_trace_files``.
    """

    # JSON keys from BOLD that are renamed to friendlier versions. Any key
    # not listed here is copied unchanged.
    _JSON_KEY_ALIASES = {
        'taxid': 'tax_id',
        'parentid': 'parent_id',
        'parentname': 'parent_name',
        'taxonrep': 'taxon_rep',
    }

    # These pairs correspond to conversions of key names from BOLD to
    # friendly versions:
    #
    # (key name from BOLD, friendlier key name)
    _XML_FIELDS = (
        # For call_id
        ('ID', 'bold_id'),
        ('sequencedescription', 'sequence_description'),
        ('database', 'database'),
        ('citation', 'citation'),
        ('taxonomicidentification', 'taxonomic_identification'),
        ('similarity', 'similarity'),
        ('specimen/url', 'specimen_url'),
        ('specimen/collectionlocation/country', 'specimen_collection_location_country'),
        ('specimen/collectionlocation/coord/lat', 'specimen_collection_location_latitude'),
        ('specimen/collectionlocation/coord/lon', 'specimen_collection_location_longitude'),
        ('record_id', 'record_id'),
        ('processid', 'process_id'),
        ('bin_uri', 'bin_uri'),
        ('specimen_identifiers/sampleid', 'specimen_identifiers_sample_id'),
        ('specimen_identifiers/catalognum', 'specimen_identifiers_catalog_num'),
        ('specimen_identifiers/fieldnum', 'specimen_identifiers_field_num'),
        ('specimen_identifiers/institution_storing', 'specimen_identifiers_institution_storing'),
        ('taxonomy/identification_provided_by', 'taxonomy_identification_provided_by'),
        ('taxonomy/phylum/taxon/taxID', 'taxonomy_phylum_taxon_id'),
        ('taxonomy/phylum/taxon/name', 'taxonomy_phylum_taxon_name'),
        ('taxonomy/class/taxon/taxID', 'taxonomy_class_taxon_id'),
        ('taxonomy/class/taxon/name', 'taxonomy_class_taxon_name'),
        ('taxonomy/order/taxon/taxID', 'taxonomy_order_taxon_id'),
        ('taxonomy/order/taxon/name', 'taxonomy_order_taxon_name'),
        ('taxonomy/family/taxon/taxID', 'taxonomy_family_taxon_id'),
        ('taxonomy/family/taxon/name', 'taxonomy_family_taxon_name'),
        ('taxonomy/genus/taxon/taxID', 'taxonomy_genus_taxon_id'),
        ('taxonomy/genus/taxon/name', 'taxonomy_genus_taxon_name'),
        ('taxonomy/species/taxon/taxID', 'taxonomy_species_taxon_id'),
        ('taxonomy/species/taxon/name', 'taxonomy_species_taxon_name'),
        ('specimen_details/voucher_type', 'specimen_details_voucher_type'),
        ('specimen_details/voucher_desc', 'specimen_details_voucher_desc'),
        ('specimen_details/extrainfo', 'specimen_details_extra_info'),
        ('specimen_details/lifestage', 'specimen_details_lifestage'),
        ('collection_event/collector', 'collection_event_collector'),
        ('collection_event/collectors', 'collection_event_collectors'),
        ('collection_event/collectiondate', 'collection_event_collection_date'),
        ('collection_event/coordinates/lat', 'collection_event_coordinates_latitude'),
        ('collection_event/coordinates/long', 'collection_event_coordinates_longitude'),
        ('collection_event/exactsite', 'collection_event_exact_site'),
        ('collection_event/country', 'collection_event_country'),
        ('collection_event/province', 'collection_event_province'),
        ('specimen_imagery/media/mediaID', 'specimen_imagery_media_id'),
        ('specimen_imagery/media/caption', 'specimen_imagery_media_caption'),
        ('specimen_imagery/media/metatags', 'specimen_imagery_media_metatags'),
        ('specimen_imagery/media/copyright', 'specimen_imagery_media_copyright'),
        ('specimen_imagery/media/image_file', 'specimen_imagery_media_image_file'),
        ('tracefiles/read/read_id', 'tracefiles_read_read_id'),
        ('tracefiles/read/run_date', 'tracefiles_read_run_date'),
        ('tracefiles/read/sequencing_center', 'tracefiles_read_sequencing_center'),
        ('tracefiles/read/direction', 'tracefiles_read_direction'),
        ('tracefiles/read/seq_primer', 'tracefiles_read_seq_primer'),
        ('tracefiles/read/trace_link', 'tracefiles_read_trace_link'),
        ('tracefiles/read/markercode', 'tracefiles_read_marker_code'),
        ('sequences/sequence/sequenceID', 'sequences_sequence_sequence_id'),
        ('sequences/sequence/markercode', 'sequences_sequence_marker_code'),
        ('sequences/sequence/genbank_accession', 'sequences_sequence_genbank_accession'),
        ('sequences/sequence/nucleotides', 'sequences_sequence_nucleotides'),
    )

    def _parse_data(self, service, result_string):
        """Dispatches the raw BOLD response to the parser for ``service``.

        Args:
            service: Alias of the method used to interact with BOLD.
            result_string: Payload returned from BOLD. Text (XML, JSON, TSV
                or FASTA) for most services, bytes for ``call_trace_files``.

        Returns:
            None. Results are stored in ``self.items`` (or in
            ``self.file_contents`` for ``call_trace_files``).

        Raises:
            ValueError: "BOLD did not return any result." when the payload
                is empty or contains only whitespace.
        """
        self.method = service
        self.service = service

        # Truthiness works for both text and bytes payloads; comparing a
        # bytes payload against '' is never true on Python 3.
        if not result_string.strip():
            raise ValueError("BOLD did not return any result.")

        if service in ('call_taxon_search', 'call_taxon_data'):
            self._parse_json(result_string)
        elif service in ('call_specimen_data', 'call_full_data', 'call_id'):
            # Python 2.6 does not have ET.ParseError; there the parser
            # raises the underlying expat error instead.
            try:
                parse_error = ET.ParseError
            except AttributeError:
                parse_error = xml.parsers.expat.ExpatError
            try:
                self._parse_xml(result_string)
            except parse_error:
                # result_string could be data as tab-separated values (tsv),
                # which is handed back to the caller untouched.
                self.items = result_string
        elif service == 'call_sequence_data':
            self._parse_fasta(result_string)
        elif service == 'call_trace_files':
            # file_contents is in binary form
            self.file_contents = result_string

    def _parse_json(self, result_string):
        """Parses JSON response from BOLD.

        Args:
            result_string: JSON string returned from BOLD.

        Returns:
            None. ``self.items`` is set to a list of dictionaries, one per
            record, with some BOLD key names replaced by friendlier ones.

        Raises:
            ValueError: "BOLD did not return any result." when the decoded
                JSON is not an object (mapping).
        """
        response = json.loads(result_string)
        if not hasattr(response, 'items'):
            raise ValueError("BOLD did not return any result.")

        # An object whose keys all start with digits is treated as a
        # collection of records keyed by id. Any other key means the object
        # itself is the single record.
        is_single_record = any(
            re.search('^[0-9]+', key) is None for key in response
        )
        if is_single_record:
            records = [response]
        else:
            records = list(response.values())

        items_from_bold = []
        for record in records:
            item = dict()
            # Values that are not mappings still yield an (empty) item.
            if hasattr(record, 'items'):
                for key, value in record.items():
                    item[self._JSON_KEY_ALIASES.get(key, key)] = value
            items_from_bold.append(item)
        self.items = items_from_bold

    def _parse_xml(self, result_string):
        """Parses XML response from BOLD.

        Reads ``self.method`` to decide which child elements of the root
        hold the records: ``match`` for ``call_id``, ``record`` otherwise.

        Args:
            result_string: XML string returned from BOLD.

        Returns:
            None. ``self.items`` is set to a list of dictionaries, one per
            record. A field found once is stored as its text, a field found
            several times as a list of texts, and a missing field is omitted.
        """
        xml_tag = 'match' if self.method == 'call_id' else 'record'
        root = ET.fromstring(result_string)

        items_from_bold = []
        for record in root.findall(xml_tag):
            item = dict()
            for path, key in self._XML_FIELDS:
                matched = record.findall(path)
                if not matched:
                    continue
                if len(matched) == 1:
                    item[key] = matched[0].text
                else:
                    item[key] = [node.text for node in matched]
            items_from_bold.append(item)
        self.items = items_from_bold

    def _parse_fasta(self, result_string):
        """Parses string response from BOLD containing FASTA sequences.

        The text is written to a temporary file that is parsed with
        ``SeqIO.parse`` and always removed afterwards, even if parsing fails.

        Args:
            result_string: FASTA sequences as string returned from BOLD.

        Returns:
            None. ``self.items`` is set to the list of records produced by
            ``SeqIO.parse``.
        """
        import tempfile

        # mkstemp creates a uniquely named file, so concurrent calls cannot
        # overwrite each other's data.
        descriptor, filename = tempfile.mkstemp(prefix="tmp_", suffix=".fas")
        try:
            with os.fdopen(descriptor, "w") as handle:
                handle.write(result_string)
            self.items = list(SeqIO.parse(filename, "fasta"))
        finally:
            os.remove(filename)
class Request(object):
    """Builds and sends an HTTP request to a BOLD web service.

    Calling :meth:`get` returns a :class:`Response <Response>` object that
    holds the parsed data.
    """

    def get(self, service, **kwargs):
        """Does HTTP request to BOLD webservice.

        Args:
            service: The BOLD API alias to interact with.
            kwargs: Parameters sent by users. Must contain ``url``, the
                endpoint to query. The other keys that are read depend on
                ``service`` (e.g. ``seq`` and ``db`` for ``call_id``).

        Returns:
            A Response class containing parsed data as attribute `items`
            (or `file_contents` for ``call_trace_files``).
        """
        params = ''
        if service == 'call_id':
            sequence = utils._prepare_sequence(kwargs['seq'])
            params = _urlencode({'db': kwargs['db'], 'sequence': sequence})
        elif service == 'call_taxon_search':
            fuzzy = 'true' if kwargs['fuzzy'] is True else 'false'
            params = _urlencode({
                'taxName': kwargs['taxonomic_identification'],
                'fuzzy': fuzzy,
            })
        elif service == 'call_taxon_data':
            payload = {
                'taxId': kwargs['tax_id'],
                'dataTypes': kwargs['data_type'],
            }
            # includeTree is only sent when the caller asked for it.
            if kwargs['include_tree'] is not False:
                payload['includeTree'] = 'true'
            params = _urlencode(payload)
        elif service in ('call_specimen_data', 'call_sequence_data',
                         'call_full_data', 'call_trace_files'):
            # Forward every argument the user actually supplied; ``url`` is
            # the endpoint itself and not a query parameter.
            payload = dict(
                (key, value) for key, value in kwargs.items()
                if value is not None and key != 'url'
            )
            params = _urlencode(payload)

        url = kwargs['url'] + "?" + params
        req = _Request(url, headers={'User-Agent': 'BiopythonClient'})
        handle = _urlopen(req)
        try:
            raw_result = handle.read()
        finally:
            # Always release the connection, even if reading fails.
            handle.close()

        response = Response()
        if service == 'call_trace_files':
            # Trace files arrive in binary form and must not be decoded.
            response._parse_data(service, raw_result)
        else:
            response._parse_data(service, _as_string(raw_result))
        return response
# Endpoint for every supported BOLD service alias.
_SERVICE_URLS = {
    'call_id': "http://boldsystems.org/index.php/Ids_xml",
    'call_taxon_search': "http://www.boldsystems.org/index.php/API_Tax/TaxonSearch",
    'call_taxon_data': "http://www.boldsystems.org/index.php/API_Tax/TaxonData",
    'call_trace_files': "http://www.boldsystems.org/index.php/API_Public/trace",
    'call_specimen_data': "http://www.boldsystems.org/index.php/API_Public/specimen",
    'call_sequence_data': "http://www.boldsystems.org/index.php/API_Public/sequence",
    'call_full_data': "http://www.boldsystems.org/index.php/API_Public/combined",
}

# Services for which some arguments may trigger very large downloads.
_BULK_DATA_SERVICES = (
    'call_trace_files',
    'call_specimen_data',
    'call_sequence_data',
    'call_full_data',
)

# Arguments that tend to match a lot of records.
_ARGS_RETURNING_LOTS_OF_DATA = ('institutions', 'researchers', 'geo')


def request(service, **kwargs):
    """Builds our request based on given arguments. Used internally.

    Looks up the URL that belongs to ``service``, warns when an argument
    likely to return a lot of data is used with one of the data retrieval
    services, and sends the request.

    Args:
        service: The BOLD API alias to interact with. Examples: `call_id`,
            `call_taxon_search`.
        kwargs: Arguments passed by users when calling our methods.

    Returns:
        The object returned by ``Request.get`` for the given service alias,
        URL and user arguments.

    Raises:
        ValueError: If ``service`` is not a known BOLD API alias.
    """
    if service not in _SERVICE_URLS:
        raise ValueError('Unknown BOLD service ``' + str(service) + '``.')
    url = _SERVICE_URLS[service]

    if service in _BULK_DATA_SERVICES:
        for arg in _ARGS_RETURNING_LOTS_OF_DATA:
            if kwargs.get(arg) is not None:
                warnings.warn('Requesting ``' + arg + '`` data from BOLD will '
                              'possibly return a lot of records and the transfer '
                              'of data might take a lot of time to complete as '
                              'many Megabytes are expected.',
                              BiopythonWarning
                              )

    req = Request()
    return req.get(service=service, url=url, **kwargs)
def call_id(seq, db):
    """Query the BOLD ID Engine API with a DNA sequence.

    http://www.boldsystems.org/index.php/resources/api?type=idengine

    Args:
        seq: DNA sequence string or seq_record object.
        db: The BOLD database of available records. Choices: ``COX1_SPECIES``,
            ``COX1``, ``COX1_SPECIES_PUBLIC``, ``COX1_L640bp``.

    Returns:
        Whatever ``request`` returns for the ``call_id`` service. In the
        example below its ``items`` attribute holds dictionaries of
        metadata, one dictionary per BOLD record.

    Examples:
        >>> import bold
        >>> seq = 'TTTTTGGTATTTGAGCAGGAATAGTAGGAACTTCTCTCAGTTTAATTATTCGAATAGAATTAGGTAATCCAGGTTTCTTAATTGGAGATGATCAAATTTATAATACTATTGTAACAGCCCATGCTTTTATTATAATTTTTTTTATAGTTATACCTATTGTAATTGGAGGATTTGGAAATTGACTAGTTCCCCTAATATTAGGTGCACCTGATATAGCTTTCCCTCGTATAAATAATATAAGATATTGACTACTTCCACCATCTTTAATATTATTAATTTCAAGTAGTATTGTAGAAAATGGAGCTGGAACAGGTTGAACAGTTTACCCCCCTCTTTCCTCTAATATTGCTCATAGAGGAACCTCAGTAGACTTAGCAATTTTTTCTCTTCATTTAGCTGGTATTTCTTCTATTTTAGGAGCTATTAATTTTATTACTACAATTATTAATATACGAGTTAATGGAATATCCTATGATCAAATACCTTTATTTGTTTGAGCTGTTGGAATTACAGCTCTTCTTTTACTTCTTTCTTTACCTGTTTTAGCAGGAGCTATCACAATACTTCTTACAGATCGAAATTTAAATACATCATTTTTTGATCCTGCAGGAGGAGGTGATCCAATTTTATACCAACATTTATTTTGATTTTTTGGTCACCC'
        >>> res = bold.call_id(seq, db='COX1')
        >>> item = res.items[1]
        >>> item['bold_id']  # this is the ID assigned by BOLD
        'GBLN3590-14'
    """
    query = dict(seq=seq, db=db)
    return request('call_id', **query)
def call_taxon_search(taxonomic_identification, fuzzy=None):
    """Look up a taxon name through the BOLD TaxonSearch API.

    http://www.boldsystems.org/index.php/resources/api?type=taxonomy#Ideasforwebservices-SequenceParameters

    Args:
        taxonomic_identification: species or any taxon name
        fuzzy: True or False. ``None`` (the default) is treated as False.

    Returns:
        Whatever ``request`` returns for the ``call_taxon_search`` service.
        In the example below its ``items`` attribute holds dictionaries of
        metadata, one dictionary per BOLD record.

    Raises:
        ValueError: If `fuzzy` is not True, False or None.

    Examples:
        >>> import bold
        >>> taxonomic_identification = 'Euptychia ordinata'
        >>> res = bold.call_taxon_search(taxonomic_identification, fuzzy=False)
        >>> item = res.items[0]  # there can be more than one result
        >>> item['tax_id']
        302603
    """
    if fuzzy is None:
        fuzzy = False
    # Only the two bool singletons are accepted; 0 and 1 are rejected.
    if not isinstance(fuzzy, bool):
        raise ValueError('Invalid value for ``fuzzy``. Use True or False.')

    query = dict(
        taxonomic_identification=taxonomic_identification,
        fuzzy=fuzzy,
    )
    return request('call_taxon_search', **query)
def call_taxon_data(tax_id, data_type=None, include_tree=None):
    """Fetch additional metadata for a taxon through the BOLD TaxonData API.

    Args:
        tax_id: Taxon to get information for.
        data_type: ``basic|all|images``. ``None`` (the default) is sent as
            ``basic``.
        include_tree: Optional. Also returns information for parent taxa.
            True or False; ``None`` (the default) is treated as False.

    Returns:
        Whatever ``request`` returns for the ``call_taxon_data`` service. In
        the example below its ``items`` attribute holds dictionaries of
        metadata for the given taxon.

    Raises:
        ValueError: If `include_tree` is not True, False or None.

    Examples:
        >>> import bold
        >>> tax_id = 88899
        >>> res = bold.call_taxon_data(tax_id, data_type='basic,images')
        >>> item = res.items[0]
        >>> item['taxon']
        'Momotus'
        >>> [(i['image'], i['photographer']) for i in item['images']]
        [('BSPBB/MJM_7364_IMG_2240_d+1345758620.JPG', 'Oscar Lopez')]
    """
    # We will use by default data_type='basic'
    chosen_data_type = 'basic' if data_type is None else data_type

    with_tree = False if include_tree is None else include_tree
    # Only the two bool singletons are accepted; 0 and 1 are rejected.
    if not isinstance(with_tree, bool):
        raise ValueError('Invalid value for ``include_tree``. Use True or False.')

    return request('call_taxon_data', tax_id=tax_id,
                   data_type=chosen_data_type, include_tree=with_tree)
def call_specimen_data(taxon=None, ids=None, bin=None, container=None,
                       institutions=None, researchers=None, geo=None,
                       format=None):
    """Retrieve specimen records through the Specimen Data Retrieval API.

    Args:
        taxon: Taxon name including the ranks: phylum, class, order, family,
            subfamily, genus and species. Example: `taxon='Bos taurus'`.
        ids: Sample ids, process ids, museum ids and field ids. Example:
            `ids='ACRJP618|ACRJP619-11'`.
        bin: BIN stands for Barcode Index number URI. Example:
            `bin='BOLD:AAA5125'`.
        container: Containers include project codes and dataset codes.
            Example: `container='DS-EZROM'`.
        institutions: Name of Specimen Storing Sites. Example:
            `institutions='Biodiversity Institute of Ontario'`.
        researchers: Collectors and specimen identifiers. Example:
            `researchers='Thibaud Decaens'`.
        geo: Geographic sites such as countries, provinces and states.
            Example: `geo='Alaska'`.
        format: Optional: ``format='tsv'`` asks BOLD for tab-separated
            values, returned as a string. If not used, the data will be
            returned as dictionaries (default behaviour).

    Returns:
        Whatever ``request`` returns for the ``call_specimen_data`` service.
        In the example below its ``items`` attribute holds the matching
        specimen records as a list of dictionaries.

    Raises:
        ValueError: If `format` is not None and not 'tsv'.

    Examples:
        >>> import bold
        >>> bin = 'BOLD:AAE2777'
        >>> res = bold.call_specimen_data(bin=bin)
        >>> class_taxon_names = [item['taxonomy_class_taxon_name'] for item in res.items]
        >>> class_taxon_names[0]
        'Insecta'
    """
    format_is_valid = format is None or format == 'tsv'
    if not format_is_valid:
        raise ValueError('Invalid value for ``format``')

    query = dict(
        taxon=taxon, ids=ids, bin=bin, container=container,
        institutions=institutions, researchers=researchers, geo=geo,
        format=format,
    )
    return request('call_specimen_data', **query)
def call_sequence_data(taxon=None, ids=None, bin=None, container=None,
                       institutions=None, researchers=None, geo=None,
                       marker=None):
    """Retrieve DNA sequences of matching records from BOLD.

    Args:
        taxon: Taxon name including the ranks: phylum, class, order, family,
            subfamily, genus and species. Example: `taxon='Bos taurus'`.
        ids: Sample ids, process ids, museum ids and field ids. Example:
            `ids='ACRJP618|ACRJP619-11'`.
        bin: BIN stands for Barcode Index number URI. Example:
            `bin='BOLD:AAA5125'`.
        container: Containers include project codes and dataset codes.
            Example: `container='DS-EZROM'`.
        institutions: Name of Specimen Storing Sites. Example:
            `institutions='Biodiversity Institute of Ontario'`.
        researchers: Collectors and specimen identifiers. Example:
            `researchers='Thibaud Decaens'`.
        geo: Geographic sites such as countries, provinces and states.
            Example: `geo='Alaska'`.
        marker: Genetic marker code. Example: `marker='COI-5P'`.

    Returns:
        Whatever ``request`` returns for the ``call_sequence_data`` service.
        In the example below its ``items`` attribute holds one sequence
        record per matching BOLD record.

    Examples:
        >>> import bold
        >>> res = bold.call_sequence_data(taxon='Hermeuptychia', geo='Peru')
        >>> items = res.items
        >>> [item.id for item in items]
        ['GBLN4477-14|Hermeuptychia', 'GBLN4478-14|Hermeuptychia', 'GBLN4479-14|Hermeuptychia']
    """
    query = dict(
        taxon=taxon, ids=ids, bin=bin, container=container,
        institutions=institutions, researchers=researchers, geo=geo,
        marker=marker,
    )
    return request('call_sequence_data', **query)
def call_full_data(taxon=None, ids=None, bin=None, container=None,
                   institutions=None, researchers=None, geo=None,
                   marker=None, format=None):
    """Retrieve specimen and sequence data via the Full Data Retrieval API
    (combined).

    Args:
        taxon: Taxon name including the ranks: phylum, class, order, family,
            subfamily, genus and species. Example: `taxon='Bos taurus'`.
        ids: Sample ids, process ids, museum ids and field ids. Example:
            `ids='ACRJP618|ACRJP619-11'`.
        bin: BIN stands for Barcode Index number URI. Example:
            `bin='BOLD:AAA5125'`.
        container: Containers include project codes and dataset codes.
            Example: `container='DS-EZROM'`.
        institutions: Name of Specimen Storing Sites. Example:
            `institutions='Biodiversity Institute of Ontario'`.
        researchers: Collectors and specimen identifiers. Example:
            `researchers='Thibaud Decaens'`.
        geo: Geographic sites such as countries, provinces and states.
            Example: `geo='Alaska'`.
        marker: Genetic marker code. Example: `marker='COI-5P'`.
        format: Optional. `format='tsv'`.

    Returns:
        Whatever ``request`` returns for the ``call_full_data`` service. In
        the example below its ``items`` attribute holds a list of dicts
        parsed from an XML file; with ``format='tsv'`` the data is expected
        as a string in TSV format.

    Raises:
        ValueError: If `format` is not None and not 'tsv'.

    Examples:
        >>> import bold
        >>> res = bold.call_full_data(taxon='Hermeuptychia', geo='Peru')
        >>> item = res.items[0]
        >>> [item['sequences_sequence_genbank_accession'] for item in res.items]
        ['KF466142', 'KF466143', 'KF466144']
    """
    format_is_valid = format is None or format == 'tsv'
    if not format_is_valid:
        raise ValueError('Invalid value for ``format``')

    query = dict(
        taxon=taxon, ids=ids, bin=bin, container=container,
        institutions=institutions, researchers=researchers, geo=geo,
        marker=marker, format=format,
    )
    return request('call_full_data', **query)
def call_trace_files(taxon=None, ids=None, bin=None, container=None,
                     institutions=None, researchers=None, geo=None,
                     marker=None):
    """Retrieve trace files from BOLD by querying with several parameters.

    Args:
        taxon: Taxon name including the ranks: phylum, class, order, family,
            subfamily, genus and species. Example: `taxon='Bos taurus'`.
        ids: Sample ids, process ids, museum ids and field ids. Example:
            `ids='ACRJP618|ACRJP619-11'`.
        bin: BIN stands for Barcode Index number URI. Example:
            `bin='BOLD:AAA5125'`.
        container: Containers include project codes and dataset codes.
            Example: `container='DS-EZROM'`.
        institutions: Name of Specimen Storing Sites. Example:
            `institutions='Biodiversity Institute of Ontario'`.
        researchers: Collectors and specimen identifiers. Example:
            `researchers='Thibaud Decaens'`.
        geo: Geographic sites such as countries, provinces and states.
            Example: `geo='Alaska'`.
        marker: Genetic marker code. Example: `marker='COI-5P'`.

    Returns:
        Whatever ``request`` returns for the ``call_trace_files`` service.
        In the example below its ``file_contents`` attribute holds a TAR
        file consisting of compressed Trace Files (traces in either .ab1 or
        .scf format) along with a file listing the Process ID, taxon and
        marker for each Trace File included.

    Examples:
        >>> import bold
        >>> res = bold.call_trace_files(taxon='Euptychia mollis',
        ...                             institutions='York University')
        >>> with open("trace_files.tar", "wb") as handle:
        ...     handle.write(res.file_contents)
        4106240
    """
    query = dict(
        taxon=taxon, ids=ids, bin=bin, container=container,
        institutions=institutions, researchers=researchers, geo=geo,
        marker=marker,
    )
    return request('call_trace_files', **query)