diff --git a/.gitignore b/.gitignore index 3944698..f7f21cd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ __pycache__/ *.py[cod] +# OS-generated files +.DS_Store + # C extensions *.so @@ -61,4 +64,4 @@ target/ .project #Pycharm -.idea \ No newline at end of file +.idea diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index f860749..ad8fb24 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -4,8 +4,93 @@ @author: Eric Lapouyade """ +from xml.sax.saxutils import escape as xml_escape + +from docx.opc.constants import RELATIONSHIP_TYPE as RT from docx.oxml import OxmlElement, parse_xml from docx.oxml.ns import qn +from docx.oxml.shape import CT_Inline +from docx.shared import Emu + + +def _get_single_xpath(element, xpath, description): + matches = element.xpath(xpath) + if len(matches) != 1: + raise RuntimeError( + "python-docx generated inline image XML is incompatible with " + "docxtpl's fast inline image template: expected exactly one " + "%s at %s, found %d." % (description, xpath, len(matches)) + ) + return matches[0] + + +def _build_inline_image_xml_template(): + """Generate the XML format string by calling python-docx once. + + This ensures the template always matches the installed python-docx version's + XML structure, even after upgrades. We create one inline image element with + valid values, then replace the exact XML attributes with Python format + placeholders before serializing it. + """ + inline = CT_Inline.new_pic_inline( + 1, + "rId", + "filename", + Emu(1), + Emu(1), + ) + + extent = _get_single_xpath(inline, "./wp:extent", "drawing extent") + doc_pr = _get_single_xpath(inline, "./wp:docPr", "drawing properties") + c_nv_pr = _get_single_xpath(inline, ".//pic:cNvPr", "picture properties") + blip = _get_single_xpath(inline, ".//a:blip", "image relationship") + shape_extent = _get_single_xpath(inline, ".//a:ext", "picture extent") + + extent.set("cx", "{cx}") + extent.set("cy", "{cy}") + doc_pr.set("id", "{shape_id}") + doc_pr.set("name", "Picture {shape_id}") + c_nv_pr.set("name", "{filename}") + blip.set(qn("r:embed"), "{rId}") + shape_extent.set("cx", "{cx}") + shape_extent.set("cy", "{cy}") + + return inline.xml + + +# Pre-built XML template for inline images, derived from the installed +# python-docx version. Using str.format() on this template avoids calling +# CT_Inline.new_pic_inline() per image (which does 2x parse_xml() + +# element manipulation + .xml serialization each time). +_INLINE_IMAGE_XML = None + + +def _get_inline_image_xml_template(): + global _INLINE_IMAGE_XML + if _INLINE_IMAGE_XML is None: + _INLINE_IMAGE_XML = _build_inline_image_xml_template() + return _INLINE_IMAGE_XML + + +def _format_inline_image_xml(shape_id, rId, filename, cx, cy): + try: + template = _get_inline_image_xml_template() + except RuntimeError: + return CT_Inline.new_pic_inline( + shape_id, + rId, + filename or "", + Emu(int(cx)), + Emu(int(cy)), + ).xml + + return template.format( + cx=int(cx), + cy=int(cy), + shape_id=shape_id, + filename=xml_escape(filename or "", {'"': """}), + rId=rId, + ) class InlineImage(object): @@ -50,16 +135,54 @@ def _add_hyperlink(self, run, url, part): return run def _insert_image(self): - pic = self.tpl.current_rendering_part.new_pic_inline( - self.image_descriptor, - self.width, - self.height, - ).xml + part = self.tpl.current_rendering_part + image_descriptor = self.image_descriptor + + # Cache the expensive parts (image part lookup, rId, dimensions) per + # (part, descriptor, width, height). The XML string itself is NOT + # cached because each insertion needs a unique shape_id - header/footer + # and footnote parts are not renumbered by fix_docpr_ids(). + cache = self.tpl._image_cache + # For hashable, value-stable descriptors (strings, paths), cache by + # value. File-like objects are mutable even when hashable (BytesIO, + # open file handles), so never cache their image metadata. + try: + if hasattr(image_descriptor, "read"): + raise TypeError + cache_key = (id(part), image_descriptor, self.width, self.height) + hash(cache_key) is not None # trigger TypeError if unhashable + except TypeError: + cache_key = None + + if cache_key is not None and cache_key in cache: + rId, cx, cy, filename = cache[cache_key] + else: + # Get or add the image part with O(1) descriptor-based dedup, + # avoiding the O(n) linear scan in python-docx's default path. + image_part, image = self.tpl._get_or_add_image_part(image_descriptor) + rId = part.relate_to(image_part, RT.IMAGE) + cx, cy = image.scaled_dimensions(self.width, self.height) + # image.filename is None for file-like descriptors (BytesIO); + # normalize to empty string to match python-docx's behavior. + filename = image.filename or "" + if cache_key is not None: + cache[cache_key] = (rId, int(cx), int(cy), filename) + + # Always assign a fresh shape_id per insertion so that drawing IDs + # are unique in every part (including headers/footers/footnotes + # which are not renumbered by fix_docpr_ids()). + self.tpl.docx_ids_index += 1 + shape_id = self.tpl.docx_ids_index + + # Generate XML from the fast template when compatible, with a native + # python-docx fallback if its generated XML shape ever changes. + pic = _format_inline_image_xml(shape_id, rId, filename, cx, cy) + if self.anchor: run = parse_xml(pic) if run.xpath(".//a:blip"): hyperlink = self._add_hyperlink( - run, self.anchor, self.tpl.current_rendering_part + run, self.anchor, part ) pic = hyperlink.xml diff --git a/docxtpl/richtext.py b/docxtpl/richtext.py index f0f4738..c79ac1e 100644 --- a/docxtpl/richtext.py +++ b/docxtpl/richtext.py @@ -62,6 +62,14 @@ def add( if style: prop += '' % style + if font: + regional_font = "" + if ":" in font: + region, font = font.split(":", 1) + regional_font = ' w:{region}="{font}"'.format(font=font, region=region) + prop += ''.format( + font=font, regional_font=regional_font + ) if color: if color[0] == "#": color = color[1:] @@ -100,14 +108,6 @@ def add( prop += '' % underline if strike: prop += "" - if font: - regional_font = "" - if ":" in font: - region, font = font.split(":", 1) - regional_font = ' w:{region}="{font}"'.format(font=font, region=region) - prop += ''.format( - font=font, regional_font=regional_font - ) if rtl: prop += '' if lang: diff --git a/docxtpl/template.py b/docxtpl/template.py index f20280a..73d8bd0 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -8,17 +8,53 @@ from os import PathLike from typing import TYPE_CHECKING, Any, Optional, IO, Union, Dict, Set -import functools import io from lxml import etree from docx import Document from docx.opc.oxml import parse_xml from docx.opc.part import XmlPart import docx.oxml.ns +from docx.oxml import OxmlElement +from docx.oxml.ns import qn from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE -from jinja2 import Environment, Template, meta +from jinja2 import Environment, meta from jinja2.exceptions import TemplateError + +def _create_optimized_env(**kwargs): + """Create an optimized Jinja2 environment for better performance. + + Optimizations applied: + - auto_reload=False: Skip checking if template source changed + - cache_size=400: Larger template cache for repeated renders + - enable_async=False: Disable async support (not needed, adds overhead) + """ + return Environment( + auto_reload=False, # Disable template auto-reload (faster) + cache_size=400, # Increase template cache size + enable_async=False, # Disable async (not needed, reduces overhead) + **kwargs + ) + + +# Module-level cached environments (created once, reused across all instances) +_CACHED_ENV = None +_CACHED_ENV_AUTOESCAPE = None + + +def _get_cached_env(autoescape=False): + """Get or create a cached Jinja2 environment for performance.""" + global _CACHED_ENV, _CACHED_ENV_AUTOESCAPE + + if autoescape: + if _CACHED_ENV_AUTOESCAPE is None: + _CACHED_ENV_AUTOESCAPE = _create_optimized_env(autoescape=True) + return _CACHED_ENV_AUTOESCAPE + else: + if _CACHED_ENV is None: + _CACHED_ENV = _create_optimized_env(autoescape=False) + return _CACHED_ENV + try: from html import escape # noqa: F401 except ImportError: @@ -43,6 +79,79 @@ class DocxTemplate(object): "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" ) + # Pre-compiled regex patterns for patch_xml() optimization + # These are compiled once at class load time, not on every render + _RE_JINJA_OPEN = re.compile( + r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}#])(<[^>]*>)+(?=\})", + re.DOTALL + ) + _RE_JINJA_CONTENT = re.compile( + r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", + re.DOTALL + ) + _RE_COLSPAN = re.compile( + r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)", + re.DOTALL + ) + _RE_CELLBG = re.compile( + r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)", + re.DOTALL + ) + _RE_SPACE_PRESERVE = re.compile( + r"((?:(?!).)*)({{.*?}}|{%.*?%})", + re.DOTALL + ) + _RE_SPACE_PRESERVE_R = re.compile( + r"({{r\s.*?}}|{%r\s.*?%})", + re.DOTALL + ) + _RE_MERGE_PREV = re.compile(r"(?:(?!).)*?{%-", re.DOTALL) + _RE_MERGE_NEXT = re.compile(r"-%}(?:(?!]|{%|{{).)*?]*?>", re.DOTALL) + _RE_VMERGE = re.compile( + r"](?:(?!]).)*?{%\s*vm\s*%}.*?]", + re.DOTALL + ) + _RE_HMERGE = re.compile( + r"](?:(?!]).)*?{%\s*hm\s*%}.*?]", + re.DOTALL + ) + _RE_CLEAN_TAGS = re.compile(r"(?<=\{[\{%])(.*?)(?=[\}%]})") + _RE_PARAGRAPH_NEWLINE = re.compile(r"])") + _RE_PARAGRAPH_REMOVE_NEWLINE = re.compile(r"\n])") + _RE_STRIPTAGS = re.compile(r".*?(|]*>)", re.DOTALL) + _RE_COLSPAN_EMPTY = re.compile(r"](?:(?!]).)*.*?", re.DOTALL) + _RE_GRIDSPAN = re.compile(r"") + _RE_TCPR = re.compile(r"(]*>)") + _RE_SHD = re.compile(r"") + _RE_RESOLVE_PARAGRAPH = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RESOLVE_RUN = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RESOLVE_TEXT = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RUN_PROPS = re.compile(r".*?") + _RE_PARA_PROPS = re.compile(r".*?") + + # Pre-compiled patterns for tag-stripping in patch_xml(). + # Strips surrounding tags from {%y ...%} / {{y ...}} template tags. + _RE_TAG_STRIP = tuple( + re.compile( + r"](?:(?!]).)*({%%|{{)%s ([^}%%]*(?:%%}|}})).*?" + % (y, y, y, y), + re.DOTALL, + ) + for y in ("tr", "tc", "p", "r") + ) + # Same for {#y ...#} comment tags (not 'r' - comments in runs are uncommon). + _RE_COMMENT_STRIP = tuple( + re.compile( + r"](?:(?!]).)*({#)%s ([^}#]*(?:#})).*?" + % (y, y, y, y), + re.DOTALL, + ) + for y in ("tr", "tc", "p") + ) + + # Cached delimiter patterns for fast header/footer Jinja detection. + _JINJA_START_PATTERNS = {} + def __init__(self, template_file: Union[IO[bytes], str, PathLike]) -> None: self.template_file = template_file self.reset_replacements() @@ -60,8 +169,113 @@ def render_init(self): self.init_docx() self.pic_map = {} self.current_rendering_part = None - self.docx_ids_index = 1000 + self._image_cache = {} self.is_saved = False + self._init_image_parts_index() + self._init_docx_ids_index() + + def _init_docx_ids_index(self): + """Set docx_ids_index above the maximum existing wp:docPr id. + + fix_docpr_ids() only renumbers the body tree, so IDs in headers, + footers, and footnotes retain their original values. Starting the + counter above the global maximum prevents collisions when inserting + new drawings into any part. + """ + import docx.oxml.ns as _ns + wp_ns = _ns.nsmap['wp'] + tag = "{%s}docPr" % wp_ns + max_id = 0 + + # Scan all parts (body + headers + footers + footnotes) + for part in self.docx._part._package.parts: + if not hasattr(part, 'blob') or part.blob is None: + continue + # Only scan XML parts that could contain drawings + ct = getattr(part, 'content_type', '') + if not ct.startswith('application/vnd.openxmlformats-officedocument'): + continue + try: + tree = etree.fromstring(part.blob) + except Exception: + continue + for elt in tree.iter(tag): + id_val = elt.get('id') + if id_val is not None: + try: + val = int(id_val) + if val > max_id: + max_id = val + except ValueError: + pass + + # Start above the highest existing ID (minimum 1000 for safety) + self.docx_ids_index = max(max_id, 1000) + + def _init_image_parts_index(self): + """Initialize image-part tracking for fast insertion. + + Uses a descriptor-keyed cache (file path string) for O(1) dedup of + images added during rendering, avoiding expensive content hashing. + """ + package = self.docx._part._package + image_parts = package.image_parts + + # Descriptor-keyed cache: maps image_descriptor -> (image_part, image) + # This is the primary dedup mechanism and avoids expensive content hashing. + self._image_descriptor_index = {} + + # Derive the next partname index by scanning existing partnames once. + # Using len() alone would collide with non-contiguous numbering + # (e.g. image1.png + image3.png → len=2 → next would be image3.ext). + max_index = 0 + for ip in image_parts: + # Partnames follow /word/media/imageN.ext pattern + name = str(ip.partname) + m = re.search(r'/image(\d+)\.', name) + if m: + idx = int(m.group(1)) + if idx > max_index: + max_index = idx + self._image_part_counter = max_index + + def _get_or_add_image_part(self, image_descriptor): + """Return (image_part, image) for the given image_descriptor. + + Uses the descriptor itself (file path) as the dedup key, avoiding + expensive content hashing. Falls back to always creating a new part + for non-hashable descriptors (file-like objects). + """ + from docx.image.image import Image + from docx.opc.packuri import PackURI + from docx.parts.image import ImagePart + + # For string paths, use the path as a cheap dedup key. + cache_key = image_descriptor if isinstance(image_descriptor, str) else None + + if cache_key is not None: + cached = self._image_descriptor_index.get(cache_key) + if cached is not None: + return cached + + image = Image.from_file(image_descriptor) + + # Create image part with sequential partname + self._image_part_counter += 1 + partname = PackURI( + "/word/media/image%d.%s" % (self._image_part_counter, image.ext) + ) + image_part = ImagePart.from_image(image, partname) + + # Add to the package collection + package = self.docx._part._package + package.image_parts.append(image_part) + + result = (image_part, image) + if cache_key is not None: + self._image_descriptor_index[cache_key] = result + + return result def __getattr__(self, name): return getattr(self.docx, name) @@ -88,114 +302,64 @@ def patch_xml(self, src_xml): unescape html entities, etc...""" # replace {{ by {{ ( works with {{ }} {% and %} {# and #}) - src_xml = re.sub( - r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}\#])(<[^>]*>)+(?=\})", - "", - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_JINJA_OPEN.sub("", src_xml) # replace {{jinja2 stuff}} by {{jinja2 stuff}} # same thing with {% ... %} and {# #} # "jinja2 stuff" could a variable, a 'if' etc... anything jinja2 will understand def striptags(m): - return re.sub( - ".*?(|]*>)", "", m.group(0), flags=re.DOTALL - ) + return self._RE_STRIPTAGS.sub("", m.group(0)) - src_xml = re.sub( - r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", - striptags, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_JINJA_CONTENT.sub(striptags, src_xml) # manage table cell colspan def colspan(m): cell_xml = m.group(1) + m.group(3) - cell_xml = re.sub( - r"](?:(?!]).)*.*?", - "", - cell_xml, - flags=re.DOTALL, - ) - cell_xml = re.sub(r"", "", cell_xml, count=1) - return re.sub( - r"(]*>)", + cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml) + cell_xml = self._RE_GRIDSPAN.sub("", cell_xml, count=1) + return self._RE_TCPR.sub( r'\1' % m.group(2), cell_xml, ) - src_xml = re.sub( - r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)", - colspan, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_COLSPAN.sub(colspan, src_xml) # manage table cell background color def cellbg(m): cell_xml = m.group(1) + m.group(3) - cell_xml = re.sub( - r"](?:(?!]).)*.*?", - "", - cell_xml, - flags=re.DOTALL, - ) - cell_xml = re.sub(r"", "", cell_xml, count=1) - return re.sub( - r"(]*>)", + cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml) + cell_xml = self._RE_SHD.sub("", cell_xml, count=1) + return self._RE_TCPR.sub( r'\1' % m.group(2), cell_xml, ) - src_xml = re.sub( - r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)", - cellbg, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_CELLBG.sub(cellbg, src_xml) # ensure space preservation - src_xml = re.sub( - r"((?:(?!).)*)({{.*?}}|{%.*?%})", + src_xml = self._RE_SPACE_PRESERVE.sub( r'\1\2', src_xml, - flags=re.DOTALL, ) - src_xml = re.sub( - r"({{r\s.*?}}|{%r\s.*?%})", + src_xml = self._RE_SPACE_PRESERVE_R.sub( r'\1', src_xml, - flags=re.DOTALL, ) # {%- will merge with previous paragraph text - src_xml = re.sub(r"(?:(?!).)*?{%-", "{%", src_xml, flags=re.DOTALL) + src_xml = self._RE_MERGE_PREV.sub("{%", src_xml) # -%} will merge with next paragraph text - src_xml = re.sub( - r"-%}(?:(?!]|{%|{{).)*?]*?>", "%}", src_xml, flags=re.DOTALL - ) + src_xml = self._RE_MERGE_NEXT.sub("%}", src_xml) - for y in ["tr", "tc", "p", "r"]: - # replace into xml code the row/paragraph/run containing - # {%y xxx %} or {{y xxx}} template tag - # by {% xxx %} or {{ xx }} without any surrounding tags : - # This is mandatory to have jinja2 generating correct xml code - pat = ( - r"](?:(?!]).)*({%%|{{)%(y)s ([^}%%]*(?:%%}|}})).*?" - % {"y": y} - ) - src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL) - - for y in ["tr", "tc", "p"]: - # same thing, but for {#y xxx #} (but not where y == 'r', since that - # makes less sense to use comments in that context - pat = ( - r"](?:(?!]).)*({#)%(y)s ([^}#]*(?:#})).*?" - % {"y": y} - ) - src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL) + # Strip surrounding tags from {%y ...%} / {{y ...}} template tags. + # This is mandatory for jinja2 to generate correct xml code. + # Patterns are pre-compiled as class attributes to avoid recompilation. + for pat in self._RE_TAG_STRIP: + src_xml = pat.sub(r"\1 \2", src_xml) + + # Same for {#y ...#} comment tags (not 'r' — comments in runs are uncommon). + for pat in self._RE_COMMENT_STRIP: + src_xml = pat.sub(r"\1 \2", src_xml) # add vMerge # use {% vm %} to make this table cell and its copies @@ -220,12 +384,7 @@ def v_merge(m1): flags=re.DOTALL, ) - src_xml = re.sub( - r"](?:(?!]).)*?{%\s*vm\s*%}.*?]", - v_merge_tc, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_VMERGE.sub(v_merge_tc, src_xml) # Use ``{% hm %}`` to make table cell become horizontally merged within # a ``{% for %}``. @@ -279,12 +438,7 @@ def without_gridspan(m2): # Discard every other cell generated in loop. return "{% if loop.first %}" + xml + "{% endif %}" - src_xml = re.sub( - r"](?:(?!]).)*?{%\s*hm\s*%}.*?]", - h_merge_tc, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_HMERGE.sub(h_merge_tc, src_xml) def clean_tags(m): return ( @@ -298,18 +452,17 @@ def clean_tags(m): .replace("’", "'") ) - src_xml = re.sub(r"(?<=\{[\{%])(.*?)(?=[\}%]})", clean_tags, src_xml) + src_xml = self._RE_CLEAN_TAGS.sub(clean_tags, src_xml) return src_xml def render_xml_part(self, src_xml, part, context, jinja_env=None): - src_xml = re.sub(r"])", r"\n])", r" None: if jinja_env is None: - jinja_env = Environment() + jinja_env = _get_cached_env() for section in self.docx.sections: for part in section.part.package.parts: @@ -378,6 +531,10 @@ def render_footnotes( part._blob = xml.encode("utf-8") def resolve_listing(self, xml): + # Early exit: if no Listing special characters are present (common case), + # there's nothing to resolve, skip the work below. + if "\t" not in xml and "\n" not in xml and "\a" not in xml and "\f" not in xml: + return xml def resolve_text(run_properties, paragraph_properties, m): xml = m.group(0).replace( @@ -403,30 +560,24 @@ def resolve_text(run_properties, paragraph_properties, m): return xml def resolve_run(paragraph_properties, m): - run_properties = re.search(r".*?", m.group(0)) + run_properties = self._RE_RUN_PROPS.search(m.group(0)) run_properties = run_properties.group(0) if run_properties else "" - return re.sub( - r"]*)?>.*?", + return self._RE_RESOLVE_TEXT.sub( lambda x: resolve_text(run_properties, paragraph_properties, x), m.group(0), - flags=re.DOTALL, ) def resolve_paragraph(m): - paragraph_properties = re.search(r".*?", m.group(0)) + paragraph_properties = self._RE_PARA_PROPS.search(m.group(0)) paragraph_properties = ( paragraph_properties.group(0) if paragraph_properties else "" ) - return re.sub( - r"]*)?>.*?", + return self._RE_RESOLVE_RUN.sub( lambda x: resolve_run(paragraph_properties, x), m.group(0), - flags=re.DOTALL, ) - xml = re.sub( - r"]*)?>.*?", resolve_paragraph, xml, flags=re.DOTALL - ) + xml = self._RE_RESOLVE_PARAGRAPH.sub(resolve_paragraph, xml) return xml @@ -437,9 +588,57 @@ def build_xml(self, context, jinja_env=None): return xml def map_tree(self, tree): + """Replace the body element with the rendered tree. + + Instead of iterating over all body children to remove/re-append them + one-by-one (O(n) lxml operations, each with internal bookkeeping), + we swap the entire element in the document root using + root.remove() + root.insert(). This is O(1) since the root element + () has only a handful of direct children. + + The body's index is located first so document element order is + preserved (e.g. body before sectPr). + + SAFETY: If the body is not a direct child of root (malformed template) + or if remove/insert raises for any reason, we fall back to the slower + child-by-child copy so rendering is never broken. + """ root = self.docx._element - body = root.body - root.replace(body, tree) + old_body = root.body + + # Find where sits among root's direct children so we can + # re-insert the new tree at the same position. + body_index = None + for i, child in enumerate(root): + if child is old_body: + body_index = i + break + + if body_index is None: + # Malformed template – body is not a direct child of root. + # Fall back to child-by-child replacement on the existing body. + for child in list(old_body): + old_body.remove(child) + for child in list(tree): + old_body.append(child) + return + + try: + # Detach the old body and insert the new tree (which is itself a + # element returned by fix_tables/parse_xml) at the same + # position. This avoids O(n) per-child remove/append calls. + root.remove(old_body) + root.insert(body_index, tree) + except Exception: + # If something went wrong, restore the document to a usable state + # by re-attaching the old body (if it was already detached) and + # falling back to child-by-child copy. + if old_body.getparent() is None: + root.insert(body_index, old_body) + for child in list(old_body): + old_body.remove(child) + for child in list(tree): + old_body.append(child) def get_headers_footers(self, uri): for relKey, val in self.docx._part.rels.items(): @@ -455,13 +654,52 @@ def get_headers_footers_encoding(self, xml): return m.group(1) return "utf-8" + @classmethod + def _get_jinja_start_pattern(cls, delimiter): + pattern = cls._JINJA_START_PATTERNS.get(delimiter) + if pattern is None: + # Word can split a delimiter across XML runs, e.g. "[[" may become + # "[...[". Allow XML tags between delimiter characters. + pattern = re.compile( + r"(<[^>]*>)*".join(re.escape(char) for char in delimiter), + re.DOTALL, + ) + cls._JINJA_START_PATTERNS[delimiter] = pattern + return pattern + + def _has_jinja_tags(self, xml, jinja_env=None): + if jinja_env is None: + jinja_env = _get_cached_env() + + # Use the active environment's delimiters so custom Jinja syntax in + # headers/footers is detected by the same rules used during rendering. + start_strings = ( + jinja_env.block_start_string, + jinja_env.variable_start_string, + jinja_env.comment_start_string, + ) + return any( + # Fast path for intact delimiters; regex path catches delimiters + # fragmented by Word's XML run markup. + start_string in xml + or self._get_jinja_start_pattern(start_string).search(xml) + for start_string in start_strings + ) + def build_headers_footers_xml(self, context, uri, jinja_env=None): for relKey, part in self.get_headers_footers(uri): xml = self.get_part_xml(part) + encoding = self.get_headers_footers_encoding(xml) - xml = self.patch_xml(xml) - xml = self.render_xml_part(xml, part, context, jinja_env) - yield relKey, xml.encode(encoding) + + # Skip rendering if no Jinja tags present. Use the active Jinja + # environment so custom delimiters in headers/footers are honored. + if self._has_jinja_tags(xml, jinja_env): + xml = self.patch_xml(xml) + xml = self.render_xml_part(xml, part, context, jinja_env) + yield relKey, xml.encode(encoding) + else: + yield relKey, xml.encode(encoding) def map_headers_footers_xml(self, relKey, xml): part = self.docx._part.rels[relKey].target_part @@ -479,58 +717,102 @@ def render( # init template working attributes self.render_init() - if autoescape: - if not jinja_env: - jinja_env = Environment(autoescape=autoescape) - else: - jinja_env.autoescape = autoescape + # Use cached environment by default + if not jinja_env: + jinja_env = _get_cached_env(autoescape=autoescape) + elif autoescape: + jinja_env.autoescape = autoescape # Body xml_src = self.build_xml(context, jinja_env) - # fix tables if needed + # Fix tables if needed tree = self.fix_tables(xml_src) - # fix docPr ID's + # Fix docPr ID's self.fix_docpr_ids(tree) # Replace body xml tree self.map_tree(tree) - # Headers - headers = self.build_headers_footers_xml(context, self.HEADER_URI, jinja_env) - for relKey, xml in headers: - self.map_headers_footers_xml(relKey, xml) - - # Footers - footers = self.build_headers_footers_xml(context, self.FOOTER_URI, jinja_env) - for relKey, xml in footers: - self.map_headers_footers_xml(relKey, xml) - + # Headers & Footers - skip when no Jinja tags are present. + for uri in (self.HEADER_URI, self.FOOTER_URI): + try: + has_jinja = any( + self._has_jinja_tags(xml, jinja_env) + for xml in ( + self.get_part_xml(part) + for _relKey, part in self.get_headers_footers(uri) + ) + ) + if has_jinja: + for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): + self.map_headers_footers_xml(relKey, xml) + except Exception: + # Fallback: guards against unexpected part structure (e.g. blob + # is None, missing attributes). Not malformed XML - that would + # fail in build_headers_footers_xml regardless. + for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): + self.map_headers_footers_xml(relKey, xml) + + # Properties: no skip-check needed - these are a handful of short + # strings (author, title, etc.) where from_string() is near-zero cost. self.render_properties(context, jinja_env) + # Footnotes: no skip-check needed - at most one part exists in typical + # documents, and many have none, so the loop body rarely executes. self.render_footnotes(context, jinja_env) # set rendered flag self.is_rendered = True - # using of TC tag in for cycle can cause that count of columns does not - # correspond to real count of columns in row. This function is able to fix it. + # Using of TC tag in for cycle can cause that count of columns does not + # correspond to real count of columns in row. def fix_tables(self, xml): - parser = etree.XMLParser(recover=True) - tree = etree.fromstring(xml, parser=parser) + # Use parse_xml with safe fallback for malformed XML + try: + tree = parse_xml(xml) # parse_xml() is significantly faster + except Exception: + # Fallback to permissive parser in the event of malformed XML + parser = etree.XMLParser(recover=True) + tree = etree.fromstring(xml, parser=parser) # get namespace ns = "{" + tree.nsmap["w"] + "}" # walk trough xml and find table for t in tree.iter(ns + "tbl"): tblGrid = t.find(ns + "tblGrid") + if tblGrid is None: + continue + columns = tblGrid.findall(ns + "gridCol") - to_add = 0 - # walk trough all rows and try to find if there is higher cell count + columns_len = len(columns) + + # Single pass row analysis with both counters + max_raw_cells = 0 # For ADD decision (raw tc count) + max_effective_cells = 0 # For REMOVE decision (with gridSpan) + for r in t.iter(ns + "tr"): cells = r.findall(ns + "tc") - if (len(columns) + to_add) < len(cells): - to_add = len(cells) - len(columns) + raw_count = len(cells) + effective_count = 0 + + for cell in cells: + tc_pr = cell.find(ns + "tcPr") + if tc_pr is not None: + grid_span = tc_pr.find(ns + "gridSpan") + if grid_span is not None: + effective_count += int(grid_span.get(ns + "val")) + continue + effective_count += 1 + + if raw_count > max_raw_cells: + max_raw_cells = raw_count + if effective_count > max_effective_cells: + max_effective_cells = effective_count + + # ADD columns based on RAW cell count (original behavior) + to_add = max_raw_cells - columns_len if max_raw_cells > columns_len else 0 + # is necessary to add columns? if to_add > 0: # at first, calculate width of table according to columns @@ -552,34 +834,16 @@ def fix_tables(self, xml): int(float(c.get(ns + "w")) * new_average / old_average) ), ) - # add new columns + # add new columns using OxmlElement for proper python-docx compatibility for i in range(to_add): - etree.SubElement( - tblGrid, ns + "gridCol", {ns + "w": str(int(new_average))} - ) + new_col = OxmlElement('w:gridCol') + new_col.set(qn('w:w'), str(int(new_average))) + tblGrid.append(new_col) - # Refetch columns after columns addition. + # REMOVE columns based on EFFECTIVE cell count (original behavior) columns = tblGrid.findall(ns + "gridCol") columns_len = len(columns) - - cells_len_max = 0 - - def get_cell_len(total, cell): - tc_pr = cell.find(ns + "tcPr") - grid_span = None if tc_pr is None else tc_pr.find(ns + "gridSpan") - - if grid_span is not None: - return total + int(grid_span.get(ns + "val")) - - return total + 1 - - # Calculate max of table cells to compare with `gridCol`. - for r in t.iter(ns + "tr"): - cells = r.findall(ns + "tc") - cells_len = functools.reduce(get_cell_len, cells, 0) - cells_len_max = max(cells_len_max, cells_len) - - to_remove = columns_len - cells_len_max + to_remove = columns_len - max_effective_cells if columns_len > max_effective_cells else 0 # If after the loop, there're less columns, than # originally was, remove extra `gridCol` declarations. @@ -608,8 +872,11 @@ def get_cell_len(total, cell): return tree def fix_docpr_ids(self, tree): - # some Ids may have some collisions : so renumbering all of them : - for elt in tree.xpath("//wp:docPr", namespaces=docx.oxml.ns.nsmap): + # Some Ids may have some collisions : so renumbering all of them + wp_ns = docx.oxml.ns.nsmap['wp'] + tag = "{%s}docPr" % wp_ns + + for elt in tree.iter(tag): self.docx_ids_index += 1 elt.attrib["id"] = str(self.docx_ids_index) @@ -913,7 +1180,7 @@ def get_undeclared_template_variables( if jinja_env: env = jinja_env else: - env = Environment() + env = _get_cached_env() parse_content = env.parse(xml) all_variables = meta.find_undeclared_variables(parse_content) diff --git a/pyproject.toml b/pyproject.toml index 476fc0e..28533c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,10 @@ repository = "https://github.com/elapouya/python-docx-template.git" document = "https://docxtpl.readthedocs.org" [tool.poetry] +name = "docxtpl" version = "0.0.0" +description = "Python docx template engine" +authors = ["Eric Lapouyade "] [tool.poetry.requires-plugins] poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] } diff --git a/tests/header_footer_custom_delimiters.py b/tests/header_footer_custom_delimiters.py new file mode 100644 index 0000000..05d3329 --- /dev/null +++ b/tests/header_footer_custom_delimiters.py @@ -0,0 +1,52 @@ +import os +import zipfile + +import jinja2 +from docx import Document + +from docxtpl import DocxTemplate + + +template_path = "output/header_footer_custom_delimiters_tpl.docx" +output_path = "output/header_footer_custom_delimiters.docx" + +os.makedirs("output", exist_ok=True) + +document = Document() +document.add_paragraph("Body text") +section = document.sections[0] +section.header.paragraphs[0].text = "[[ date ]]" +section.footer.paragraphs[0].text = "[[ company_name ]]" +document.save(template_path) + +jinja_env = jinja2.Environment( + variable_start_string="[[", + variable_end_string="]]", +) + +tpl = DocxTemplate(template_path) +tpl.render( + { + "company_name": "The World Wide company", + "date": "2016-03-17", + }, + jinja_env=jinja_env, +) +tpl.save(output_path) + +with zipfile.ZipFile(output_path) as docx_zip: + header_xml = "\n".join( + docx_zip.read(name).decode("utf-8") + for name in docx_zip.namelist() + if name.startswith("word/header") + ) + footer_xml = "\n".join( + docx_zip.read(name).decode("utf-8") + for name in docx_zip.namelist() + if name.startswith("word/footer") + ) + +assert "2016-03-17" in header_xml +assert "The World Wide company" in footer_xml +assert "[[" not in header_xml +assert "[[" not in footer_xml diff --git a/tests/inline_image_file_like_cache.py b/tests/inline_image_file_like_cache.py new file mode 100644 index 0000000..1b6784d --- /dev/null +++ b/tests/inline_image_file_like_cache.py @@ -0,0 +1,30 @@ +import io +import re + +from docxtpl import DocxTemplate, InlineImage + + +def image_bytes(path): + with open(path, "rb") as image_file: + return image_file.read() + + +def embedded_rid(xml): + return re.search(r'r:embed="([^"]+)"', xml).group(1) + + +tpl = DocxTemplate("templates/inline_image_tpl.docx") +tpl.render_init() +tpl.current_rendering_part = tpl.docx._part + +stream = io.BytesIO(image_bytes("templates/django.png")) +first_xml = str(InlineImage(tpl, stream)) + +stream.seek(0) +stream.truncate() +stream.write(image_bytes("templates/python.png")) +stream.seek(0) +second_xml = str(InlineImage(tpl, stream)) + +assert embedded_rid(first_xml) != embedded_rid(second_xml) +assert tpl._image_cache == {} diff --git a/tests/inline_image_xml_template.py b/tests/inline_image_xml_template.py new file mode 100644 index 0000000..b5393ab --- /dev/null +++ b/tests/inline_image_xml_template.py @@ -0,0 +1,45 @@ +from docx.oxml import parse_xml + +import docxtpl.inline_image as inline_image + + +_INLINE_IMAGE_XML = inline_image._get_inline_image_xml_template() + +assert _INLINE_IMAGE_XML.count("{shape_id}") == 2 +assert _INLINE_IMAGE_XML.count("{cx}") == 2 +assert _INLINE_IMAGE_XML.count("{cy}") == 2 +assert _INLINE_IMAGE_XML.count("{rId}") == 1 +assert _INLINE_IMAGE_XML.count("{filename}") == 1 + +parse_xml( + _INLINE_IMAGE_XML.format( + shape_id=1, + cx=2, + cy=3, + rId="rId1", + filename="image.png", + ) +) + + +def raise_incompatible_template(): + raise RuntimeError("incompatible template") + + +original_template = inline_image._INLINE_IMAGE_XML +original_builder = inline_image._build_inline_image_xml_template +try: + inline_image._INLINE_IMAGE_XML = None + inline_image._build_inline_image_xml_template = raise_incompatible_template + fallback_xml = inline_image._format_inline_image_xml( + shape_id=1, + rId="rId1", + filename='quoted " image.png', + cx=2, + cy=3, + ) + parse_xml(fallback_xml) + assert 'name="quoted " image.png"' in fallback_xml +finally: + inline_image._INLINE_IMAGE_XML = original_template + inline_image._build_inline_image_xml_template = original_builder