Source code for geolink_formatter.parser

# -*- coding: utf-8 -*-
import datetime

import pkg_resources
import requests
from lxml.etree import XMLSchema, DTD, DocumentInvalid
from defusedxml.lxml import fromstring
from geolink_formatter.entity import Document, File


class SCHEMA(object):
    """Enumerates the available geoLink schema versions as plain string constants."""

    #: str: geoLink schema version 1.0.0
    V1_0_0 = "1.0.0"

    #: str: geoLink schema version 1.1.0
    V1_1_0 = "1.1.0"

    #: str: geoLink schema version 1.1.1
    V1_1_1 = "1.1.1"

    #: str: geoLink schema version 1.2.0
    V1_2_0 = "1.2.0"

    #: str: geoLink schema version 1.2.1
    V1_2_1 = "1.2.1"

    #: str: geoLink schema version 1.2.2
    V1_2_2 = "1.2.2"


class XML(object):
    """Parser for geoLink XML which turns document elements into entity objects."""

    _date_format = '%Y-%m-%d'
    """str: Format of date values in XML."""

    def __init__(self, host_url=None, version='1.2.2', dtd_validation=False, xsd_validation=True):
        """Create a new XML parser instance containing the geoLink XSD for validation.

        Args:
            host_url (str): URL of the OEREBlex host to resolve relative URLs. The complete URL until
                but without the */api* part has to be set, starting with *http://* or *https://*.
            version (str): The version of the geoLink schema to be used. Defaults to `1.2.2`.
            dtd_validation (bool): Enable/disable validation of document type definition (DTD).
                Optional, defaults to False.
            xsd_validation (bool): Enable/disable validation against XML schema (XSD).
                Optional, defaults to True.

        """
        self._host_url = host_url
        self._version = version
        self._dtd_validation = dtd_validation
        self._xsd_validation = xsd_validation
        xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
        # The schema is only loaded (and `_schema` only set) when XSD validation is enabled.
        if self._xsd_validation:
            with open(xsd, encoding='utf-8') as f:
                self._schema = XMLSchema(fromstring(f.read()))

    @property
    def host_url(self):
        """str: The OEREBlex host URL to resolve relative URLs."""
        return self._host_url

    def _parse_xml(self, xml):
        """Parses the specified XML and runs the enabled validations (XSD and/or DTD) on it.

        Args:
            xml (str or bytes): The XML to be parsed.

        Returns:
            lxml.etree._Element: The root element of the parsed geoLink XML.

        Raises:
            lxml.etree.XMLSyntaxError: Raised if the XML cannot be parsed.
            lxml.etree.DocumentInvalid: Raised on failed XSD or DTD validation, or if DTD
                validation is enabled but the parsed content carries no internal DTD.

        """
        if isinstance(xml, bytes):
            content = fromstring(xml)
        else:
            # NOTE(review): text is handed to the parser as UTF-16BE bytes, presumably because lxml
            # rejects str input that carries an encoding declaration - confirm before changing.
            content = fromstring(xml.encode('utf-16be'))
        if self._xsd_validation:
            self._schema.assertValid(content)
        if self._dtd_validation:
            dtd = content.getroottree().docinfo.internalDTD
            if isinstance(dtd, DTD):
                dtd.assertValid(content)
            else:
                raise DocumentInvalid('Missing DTD in parsed content')
        return content

    def _parse_date(self, value):
        """Converts a date attribute into a ``datetime.date``; empty values are returned as is."""
        if not value:
            return value
        return datetime.datetime.strptime(value, self._date_format).date()

    def _parse_file(self, file_el):
        """Creates a File entity from a *file* element, resolving relative URLs with the host URL."""
        attrib = file_el.attrib
        href = attrib.get('href')
        # A missing href is passed through untouched instead of failing on `startswith`.
        if self.host_url and href is not None and not href.startswith((u'http://', u'https://')):
            href = u'{host}{href}'.format(host=self.host_url, href=href)
        return File(
            title=attrib.get('title'),
            description=attrib.get('description'),
            href=href,
            category=attrib.get('category')
        )

    def from_string(self, xml):
        """Parses XML into internal structure.

        The specified XML string gets validated against the geoLink XSD on parsing. Documents
        without an ID and documents whose ID has already been seen are skipped.

        Args:
            xml (str or bytes): The XML to be parsed.

        Returns:
            list[geolink_formatter.entity.Document]: A list containing the parsed document elements.

        Raises:
            lxml.etree.XMLSyntaxError: Raised if the XML cannot be parsed.
            lxml.etree.DocumentInvalid: Raised on failed validation.

        """
        root = self._parse_xml(xml)
        documents = []
        seen_ids = set()

        for document_el in root.iter('document'):
            attrib = document_el.attrib
            doc_id = attrib.get('id')
            doctype = attrib.get('doctype')

            # Mangle doc_id for notices. While IDs are unique between decrees
            # and edicts, this is not the case when adding notices to the mix.
            # A missing ID is left alone so that the document is skipped below.
            if doc_id is not None and doctype == 'notice':
                doc_id += doctype

            if not doc_id or doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)

            files = [self._parse_file(file_el) for file_el in document_el.iter('file')]
            enactment_date = self._parse_date(attrib.get('enactment_date'))
            decree_date = self._parse_date(attrib.get('decree_date'))
            abrogation_date = self._parse_date(attrib.get('abrogation_date'))

            documents.append(Document(
                files=files,
                id=doc_id,
                category=attrib.get('category'),
                doctype=attrib.get('doctype'),
                federal_level=attrib.get('federal_level'),
                authority=attrib.get('authority'),
                authority_url=attrib.get('authority_url'),
                title=attrib.get('title'),
                number=attrib.get('number'),
                abbreviation=attrib.get('abbreviation'),
                instance=attrib.get('instance'),
                type=attrib.get('type'),
                subtype=attrib.get('subtype'),
                decree_date=decree_date,
                enactment_date=enactment_date,
                abrogation_date=abrogation_date,
                cycle=attrib.get('cycle'),
                municipality=attrib.get('municipality'),
                index=attrib.get('index')
            ))

        return documents

    def from_url(self, url, params=None, **kwargs):
        """Loads the geoLink of the specified URL and parses it into the internal structure.

        Args:
            url (str): The URL of the geoLink to be parsed.
            params (dict): Dictionary or bytes to be sent in the query string for the
                :class:`requests.models.Request`.
            **kwargs: Optional arguments that ``requests.api.request`` takes.

        Returns:
            list[geolink_formatter.entity.Document] or None: A list containing the parsed document
            elements. None is returned if the response status is neither 200 nor an error status.

        Raises:
            lxml.etree.XMLSyntaxError: Raised if the response content cannot be parsed.
            lxml.etree.DocumentInvalid: Raised on failed validation.
            requests.HTTPError: Raised on failed HTTP request.

        """
        response = requests.get(url, params=params, **kwargs)
        if response.status_code == 200:
            return self.from_string(response.content)
        # Raises for error status codes only; any other status falls through.
        response.raise_for_status()
        return None