Luke Ross

lxmlmeld

6 releases git clone https://lukeross.name/projects/lxmlmeld.git/

Meld-like templating using lxml.

Commit 949b3aeb41049f4ef6f15d2422f4d5255ad5c47d

Re-read the meld3 documentation and layout to improve compatibility.

Committed 17 Mar 2017 by Luke Ross

README.md

@@ -8,16 +8,17 @@ I previously forked Meld3 and made it use lxml, but the code wasn't elegant
 as it didn't use much of lxml's additional features. This version is a
 from-the-ground-up rewrite based on the Meld3 documentation and test suite.
 
-## Differences
+## Key Differences
 
- - The undocumented `fillmeldhtmlform()` is not implemented
+ - `fillmeldhtmlform()` is not implemented (it's rather too magic)
+ - `findwithattrib` is not implemented (as unclear how it interacts with
+   namespaces; use lxml's finders or xpath)
  - replace() follows the meld3 syntax; the lxml call of the same name is
    renamed `replace_child()`
  - The property `parent` doesn't exist; use `getparent()`
- - You can pass lxml Elements or lists of Elements to `replace()` and
-   `content()`
- - When using `structure=True` the content must be parsable as XML
- - libxml2 uses doctype sniffing for XHTML, so `write_xml()` and
-   `write_xhtml()` only differ by default doctype
+ - You can pass lxml Elements or lists of Elements to `replace()`,
+   `content()` and `fillmelds()`
+ - When using `structure=True` the content must be broadly parsable as XML
  - `repeat` inserts adjacent to the original node, not at the end of the
    parent
+ - doctypes to the `write_*` functions can be plain strings


lxmlmeld/__init__.py

@@ -3,13 +3,46 @@ from copy import deepcopy
 from lxml import etree
 
 NS = "http://www.plope.com/software/meld3"
-_html_doctype = (
-    "HTML", "-//W3C//DTD HTML 4.01 Transitional//EN",
-    "http://www.w3.org/TR/html4/loose.dtd"
-)
-_xhtml_doctype = (
-    "html", "-//W3C//DTD XHTML 1.0 Transitional//EN",
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
+
+
+class _doctype_dict(object):
+    def __init__(self, **kwargs):
+        self._doctypes = kwargs
+
+    def __getattr__(self, name):
+        try:
+            return self._doctypes[name]
+        except KeyError:
+            raise AttributeError(name)
+
+    def __getitem__(self, name):
+        return self._doctypes[name]
+
+    def keys(self):
+        return self._doctypes.keys()
+
+    def items(self):
+        return self._doctypes.items()
+
+    def __iter__(self):
+        return iter(self._doctypes)
+
+    def __repr__(self):
+        return repr(self._doctypes)
+
+    def get(self, *args, **kwargs):
+        return self._doctypes.get(*args, **kwargs)
+
+
+doctypes = _doctype_dict(
+    html='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
+         '"http://www.w3.org/TR/html4/loose.dtd">',
+    html_strict='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
+                '"http://www.w3.org/TR/html4/strict.dtd">',
+    xhtml='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
+          '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
+    xhtml_strict='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
+                 '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
 )
 
 
@@ -20,12 +53,23 @@ class Element(etree.ElementBase):
         )
 
     def clone(self, parent=None):
+        """
+        Copy an element, including all of its children. The new element
+        initially is not associated with the document, but if an element
+        is passed in as parent the newly-copied element will be appended
+        to this parent element. Returns the new element.
+        """
         ret = deepcopy(self)
         if parent is not None:
             parent.append(ret)
         return ret
 
     def findmeld(self, name, default=None):
+        """
+        Searches this element and all children for any with a meld:id
+        attribute with value equal to the name parameter. Returns
+        default (None if not supplied) if the node cannot be found.
+        """
         ret = self.xpath(
             "descendant-or-self::*[@meld:id='{}']".format(name),
             namespaces={"meld": NS}
@@ -33,14 +77,31 @@ class Element(etree.ElementBase):
         return ret[0] if ret else default
 
     def findmelds(self):
+        """
+        Returns an iterable of all elements (this one or children) with a
+        meld:id attribute (of any value).
+        """
         return self.xpath(
             "descendant-or-self::*[@meld:id]", namespaces={"meld": NS}
         )
 
     def meldid(self):
+        """
+        Returns the value of the meld:id attribute of this element, or None
+        if this element does not have a meld:id attribute.
+        """
         return self.get(etree.QName(NS, "id").text)
 
     def repeat(self, iterable, childname=None):
+        """
+        Given an iterable, repeat the target element the same number of times
+        as the length of the iterable. Returns an iterable of (new_element,
+        iterable_data_item) pairs, from which you can mutate new_element as
+        desired.
+
+        The target element is by default this element, but if a meld:id is
+        passed in as childname then that element is found and used instead.
+        """
         thing = self.findmeld(childname) if childname else self
         tail = thing.tail
         thing.tail = None
@@ -66,6 +127,11 @@ class Element(etree.ElementBase):
         thing.getparent().remove(thing)
 
     def replace_child(self, old_element, new_element):
+        """
+        Looks for this old_element as a direct child of this element, removes
+        it and replaces it in the same position with new_element. This
+        is lxml's Element.replace()
+        """
         super(Element, self).replace(old_element, new_element)
 
     def replace(self, text, structure=False):
@@ -84,12 +150,14 @@ class Element(etree.ElementBase):
         If the argument is a list or tuple it is expected to be a list of
         lxml Element nodes which will all be used as the replacement.
 
-        Returns nothing.
+        If the element had no parent to update this call does nothing and
+        returns None.
         """
         parent = self.getparent()
         if parent is None:
             return
 
+        idx = self.parentindex()
         if isinstance(text, (list, tuple)):
             for node in text:
                 parent.insert(self.parentindex(), node)
@@ -117,7 +185,16 @@ class Element(etree.ElementBase):
                     parent.text = text
             parent.remove(self)
 
+        return idx
+
     def content(self, text, structure=False):
+        """
+        Sets the content of this element. It removes all of the text and child
+        elements before doing so. You can pass in text, an lxml element or list
+        of lxml elements to use as the new contents. If you pass in text and
+        set structure to true then the text will be treated as a fragment of
+        XML, parsed and inserted. Returns nothing.
+        """
         if isinstance(text, (list, tuple)):
             self.text = None
             self[:] = list(text)
@@ -132,10 +209,24 @@ class Element(etree.ElementBase):
             self.text = text
 
     def attributes(self, **kwargs):
+        """
+        Attributes are set on the node using the argument names given.
+        Existing attributes with the same name are overwritten. Returns
+        nothing.
+        """
         for k, v in kwargs.items():
             self.set(k, v)
 
     def fillmelds(self, **kwargs):
+        """
+        For each kwarg find the element with the meld:id with that argument
+        name and set the content of the element to the value of the argument.
+        Anything that can be passed to content() can be used as an argument
+        value.
+
+        Any arguments with names that don't correspond to meld:ids in the
+        document are returned as a list of argument names.
+        """
         missing = set()
         for k, v in kwargs.items():
             ele = self.findmeld(k)
@@ -145,14 +236,31 @@ class Element(etree.ElementBase):
                 missing.add(k)
         return list(missing)
 
+    def __mod__(self, **kwargs):
+        """
+        Alias for fillmelds.
+        """
+        self.fillmelds(**kwargs)
+
     def parentindex(self):
+        """
+        Gives the array index of this node on its parent. Returns None
+        if this element has no parent.
+        """
         parent = self.getparent()
         return parent.index(self) if parent is not None else None
 
     def deparent(self):
+        """
+        Removes this element from its parent (i.e. it removes it from the
+        document). If this element is already unparented it silently does
+        nothing. Returns the old index.
+        """
+        idx = self.parentindex()
         parent = self.getparent()
         if parent is not None:
             parent.remove(self)
+        return idx
 
     def _clone_without_own_ns(self):
         new = self.clone()
@@ -167,20 +275,25 @@ class Element(etree.ElementBase):
         etree.cleanup_namespaces(new)
         return new
 
-    @staticmethod
-    def _get_doctype(doctype):
-        if not isinstance(doctype, (list, tuple)):
-            return doctype
-        name, public, system = doctype
-        return '<!DOCTYPE {} PUBLIC "{}" "{}">'.format(*doctype)
-
     def write_xml(self, file, encoding=None, doctype=None, fragment=False,
                   declaration=True, pipeline=False, _kwargs={"method": "xml"},
                   _doc=None):
+        """
+        Writes this document as XML to a file (filename or file-like object).
+        The document will use the encoding and doctype specified. Doctype
+        can be a string or tuple, and none is emitted if set to None (the
+        default). An XML declaration is emitted by default but can be omitted
+        if declaration is set to False.  If fragment is true then no doctype
+        or XML declaration is emitted regardless of their values. By default
+        all meld:ids are stripped from the serialised output, but if pipeline
+        is set to true then they are serialised.
+        """
         kwargs = {k: v for k, v in _kwargs.items()}
         kwargs.update(xml_declaration=declaration, encoding=encoding)
         if doctype:
-            kwargs.update(doctype=self._get_doctype(doctype))
+            if isinstance(doctype, (tuple, list)):
+                doctype = '<!DOCTYPE {} PUBLIC "{}" "{}">'.format(*doctype)
+            kwargs.update(doctype=doctype)
         if fragment:
             kwargs.update(doctype=None, xml_declaration=False)
 
@@ -191,61 +304,78 @@ class Element(etree.ElementBase):
         else:
             doc = self._clone_without_own_ns()
 
+        ret = etree.tostring(doc, **kwargs)
         if file:
             # ElementTree.write() doesn't support doctype
-            file.write(etree.tostring(doc, **kwargs))
+            try:
+                file.write(ret)
+            except AttributeError:
+                with open(file, "eb") as fh:
+                    fh.write(ret)
         else:
-            return etree.tostring(doc, **kwargs)
+            return ret
 
-    def write_xhtml(self, file, encoding=None, doctype=_xhtml_doctype,
+    def write_xhtml(self, file, encoding=None, doctype=doctypes.xhtml,
                     fragment=False, declaration=False, pipeline=False):
-        doctype = self._get_doctype(doctype)
-        if not(doctype and "-//W3C//DTD XHTML" in doctype):
-            # libxml handles xhtml by doctype-sniffing
-            raise ValueError("Invalid doctype for XHTML")
-
-        if fragment:
-            declaration = False
-
-        if pipeline:
-            ret = self.write_xml(
-                None, encoding=encoding, doctype=doctype, pipeline=True,
-                declaration=declaration
-            )
-        else:
-            # cleaning up namespaces upsets lxml, need to re-parse :-(
-            intermediate = self.write_xml(
-                None, encoding=encoding, doctype=doctype,
-                declaration=declaration
-            )
-            intermediate = etree.fromstring(intermediate)
-            ret = self.write_xml(
-                None, encoding=encoding, doctype=doctype, pipeline=True,
-                declaration=declaration, _doc=intermediate
-            )
-
-        if fragment:
-            ret = re.sub(rb'^.*?<!DOCTYPE.*?>\s+', b'', ret, re.S)
+        """
+        Writes this document as XHTML to a file (filename or file-like object).
+        The document will use the encoding and doctype specified. Doctype
+        can be a string or tuple. It defaults to XHTML 1.0 Transitional. An XML
+        declaration is not emitted by default but can be if declaration is set
+        to true.  If fragment is true then no doctype or XML declaration is
+        emitted regardless of their values. By default all meld:ids are
+        stripped from the serialised output, but if pipeline is set to true
+        then they are serialised.
+        """
 
-        if file:
-            file.write(ret)
-        else:
-            return ret
+        # libxml2/lxml is seriously finicky about XHTML and does it based on
+        # sniffing the doctype, apparently at parse time. Furthermore
+        # _cleanup_namespacesStart is enough to break the magic. Start by
+        # serialising as XML with an XHTML doctype and then re-parsing to get
+        # the magic before emitting with the correct options.
+        intermediate = self.write_xml(
+            None, encoding=encoding, doctype=doctypes.xhtml,
+            fragment=False, declaration=True, pipeline=pipeline
+        )
+        intermediate = etree.fromstring(intermediate)
+        return self.write_xml(
+            file, encoding=encoding, doctype=doctype, pipeline=True,
+            declaration=declaration, fragment=fragment, _doc=intermediate
+        )
 
-    def write_html(self, file, encoding=None, doctype=_html_doctype,
+    def write_html(self, file, encoding=None, doctype=doctypes.html,
                    fragment=False):
+        """
+        Writes this document as HTML to a file (filename or file-like object).
+        The document will use the encoding and doctype specified. Doctype
+        can be a string or tuple. It defaults to HTML 4.01 Transitional.
+        If fragment is true then no doctype is emitted regardless of the
+        doctype parameter value.
+        """
         return self.write_xml(
             file, encoding=encoding, doctype=doctype, fragment=fragment,
             _kwargs={"method": "html"}
         )
 
     def write_xmlstring(self, *args, **kwargs):
+        """
+        Returns the document as a bytes string, formatted as XML. See
+        write_xml for the options you can specify to this call.
+        """
         return self.write_xml(None, *args, **kwargs)
 
     def write_xhtmlstring(self, *args, **kwargs):
+        """
+        Returns the document as a bytes string, formatted as XHTML. See
+        write_xhtml for the options you can specify to this call.
+        """
         return self.write_xhtml(None, *args, **kwargs)
 
     def write_htmlstring(self, *args, **kwargs):
+        """
+        Returns the document as a bytes string, formatted as HTML. See
+        write_html for the options you can specify to this call.
+        """
         return self.write_html(None, *args, **kwargs)
 
 
@@ -293,6 +423,9 @@ def _fix_html(tree):
 
 
 def parse_html(html):
+    """
+    Parses HTML from a file-like object. Returns the root element.
+    """
     t = etree.parse(html, _parser(etree.HTMLParser)).getroot()
     _fix_html(t)
     _check_tree(t)
@@ -300,6 +433,9 @@ def parse_html(html):
 
 
 def parse_htmlstring(html):
+    """
+    Parses a str or unicode of HTML. Returns the root element.
+    """
     t = etree.fromstring(html, _parser(etree.HTMLParser))
     _fix_html(t)
     _check_tree(t)


tests/test_parse_serialise.py

@@ -263,8 +263,3 @@ class XHTMLTests(TestCase):
                 self.assertNotIn(absent, serialised_default)
             for txt in (serialised_on, serialised_off, serialised_default):
                 self.assertIn(b"<br /><p></p></body></html>", txt)
-
-        with self.assertRaises(ValueError):
-            doc.write_xhtmlstring(doctype='<!DOCTYPE note SYSTEM "Note.dtd">')
-        with self.assertRaises(ValueError):
-            doc.write_xhtmlstring(doctype=None)