diff --git a/.gitignore b/.gitignore
index 3944698..f7f21cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
__pycache__/
*.py[cod]
+# OS-generated files
+.DS_Store
+
# C extensions
*.so
@@ -61,4 +64,4 @@ target/
.project
#Pycharm
-.idea
\ No newline at end of file
+.idea
diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py
index f860749..ad8fb24 100644
--- a/docxtpl/inline_image.py
+++ b/docxtpl/inline_image.py
@@ -4,8 +4,93 @@
@author: Eric Lapouyade
"""
+from xml.sax.saxutils import escape as xml_escape
+
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.oxml import OxmlElement, parse_xml
from docx.oxml.ns import qn
+from docx.oxml.shape import CT_Inline
+from docx.shared import Emu
+
+
+def _get_single_xpath(element, xpath, description):
+    """Return the single node that *xpath* selects beneath *element*.
+
+    ``description`` is a human-readable label used only in the error message.
+    Raises RuntimeError when the query does not match exactly one node.
+    """
+    found = element.xpath(xpath)
+    count = len(found)
+    if count == 1:
+        return found[0]
+    raise RuntimeError(
+        "python-docx generated inline image XML is incompatible with "
+        "docxtpl's fast inline image template: expected exactly one "
+        "%s at %s, found %d." % (description, xpath, count)
+    )
+
+
+def _build_inline_image_xml_template():
+    """Generate the inline-image ``str.format`` template from python-docx.
+
+    One inline picture element is created through
+    ``CT_Inline.new_pic_inline()`` with dummy values, the attributes that
+    vary per image are overwritten with marker tokens, and the element is
+    serialized. Every literal ``{`` or ``}`` in the serialized XML is then
+    doubled so ``str.format()`` treats it as plain text, and finally the
+    marker tokens are swapped for the named placeholders ``{cx}``, ``{cy}``,
+    ``{shape_id}``, ``{filename}`` and ``{rId}``.
+
+    Raises RuntimeError (via ``_get_single_xpath``) when the generated XML
+    does not contain exactly one of each expected element.
+    """
+    inline = CT_Inline.new_pic_inline(
+        1,
+        "rId",
+        "filename",
+        Emu(1),
+        Emu(1),
+    )
+
+    extent = _get_single_xpath(inline, "./wp:extent", "drawing extent")
+    doc_pr = _get_single_xpath(inline, "./wp:docPr", "drawing properties")
+    c_nv_pr = _get_single_xpath(inline, ".//pic:cNvPr", "picture properties")
+    blip = _get_single_xpath(inline, ".//a:blip", "image relationship")
+    shape_extent = _get_single_xpath(inline, ".//a:ext", "picture extent")
+
+    # Marker tokens contain neither braces nor XML-special characters, so
+    # they pass through serialization and brace-doubling unchanged.
+    markers = {
+        "cx": "__docxtpl_cx__",
+        "cy": "__docxtpl_cy__",
+        "shape_id": "__docxtpl_shape_id__",
+        "filename": "__docxtpl_filename__",
+        "rId": "__docxtpl_rid__",
+    }
+
+    extent.set("cx", markers["cx"])
+    extent.set("cy", markers["cy"])
+    doc_pr.set("id", markers["shape_id"])
+    doc_pr.set("name", "Picture " + markers["shape_id"])
+    c_nv_pr.set("name", markers["filename"])
+    blip.set(qn("r:embed"), markers["rId"])
+    shape_extent.set("cx", markers["cx"])
+    shape_extent.set("cy", markers["cy"])
+
+    # Double any brace python-docx itself emitted; left as-is it would be
+    # parsed by str.format() as a replacement field and raise.
+    template = inline.xml.replace("{", "{{").replace("}", "}}")
+    for field, marker in markers.items():
+        template = template.replace(marker, "{%s}" % field)
+    return template
+
+
+# Pre-built XML template for inline images, derived from the installed
+# python-docx version. Using str.format() on this template avoids calling
+# CT_Inline.new_pic_inline() per image (which does 2x parse_xml() +
+# element manipulation + .xml serialization each time).
+_INLINE_IMAGE_XML = None
+
+
+def _get_inline_image_xml_template():
+    """Return the inline-image XML template, building it on first use.
+
+    The result is stored in the module-level ``_INLINE_IMAGE_XML`` so later
+    calls reuse it. If building raises, nothing is stored and the exception
+    propagates to the caller.
+    """
+    global _INLINE_IMAGE_XML
+    cached = _INLINE_IMAGE_XML
+    if cached is not None:
+        return cached
+    cached = _build_inline_image_xml_template()
+    _INLINE_IMAGE_XML = cached
+    return cached
+
+
+def _format_inline_image_xml(shape_id, rId, filename, cx, cy):
+    """Return the XML string of one inline image.
+
+    ``shape_id`` is written to the drawing id and name, ``rId`` to the
+    image relationship reference, ``filename`` (None is treated as "") to
+    the picture name, and ``cx``/``cy`` (converted with ``int()``) to both
+    extent elements.
+
+    The pre-built template is used when it can be obtained; if obtaining
+    it raises RuntimeError, the XML is produced by
+    ``CT_Inline.new_pic_inline()`` instead.
+    """
+    try:
+        template = _get_inline_image_xml_template()
+    except RuntimeError:
+        # Template unavailable: let python-docx build this element itself.
+        return CT_Inline.new_pic_inline(
+            shape_id,
+            rId,
+            filename or "",
+            Emu(int(cx)),
+            Emu(int(cy)),
+        ).xml
+
+    return template.format(
+        cx=int(cx),
+        cy=int(cy),
+        shape_id=shape_id,
+        # The value is substituted into an attribute of the template, so a
+        # double quote is escaped in addition to the &, < and > that
+        # xml_escape() handles by default.
+        filename=xml_escape(filename or "", {'"': "&quot;"}),
+        rId=rId,
+    )
class InlineImage(object):
@@ -50,16 +135,54 @@ def _add_hyperlink(self, run, url, part):
return run
def _insert_image(self):
- pic = self.tpl.current_rendering_part.new_pic_inline(
- self.image_descriptor,
- self.width,
- self.height,
- ).xml
+ part = self.tpl.current_rendering_part
+ image_descriptor = self.image_descriptor
+
+ # Cache the expensive parts (image part lookup, rId, dimensions) per
+ # (part, descriptor, width, height). The XML string itself is NOT
+ # cached because each insertion needs a unique shape_id - header/footer
+ # and footnote parts are not renumbered by fix_docpr_ids().
+ cache = self.tpl._image_cache
+ # For hashable, value-stable descriptors (strings, paths), cache by
+ # value. File-like objects are mutable even when hashable (BytesIO,
+ # open file handles), so never cache their image metadata.
+ try:
+ if hasattr(image_descriptor, "read"):
+ raise TypeError
+ cache_key = (id(part), image_descriptor, self.width, self.height)
+ hash(cache_key) is not None # trigger TypeError if unhashable
+ except TypeError:
+ cache_key = None
+
+ if cache_key is not None and cache_key in cache:
+ rId, cx, cy, filename = cache[cache_key]
+ else:
+ # Get or add the image part with O(1) descriptor-based dedup,
+ # avoiding the O(n) linear scan in python-docx's default path.
+ image_part, image = self.tpl._get_or_add_image_part(image_descriptor)
+ rId = part.relate_to(image_part, RT.IMAGE)
+ cx, cy = image.scaled_dimensions(self.width, self.height)
+ # image.filename is None for file-like descriptors (BytesIO);
+ # normalize to empty string to match python-docx's behavior.
+ filename = image.filename or ""
+ if cache_key is not None:
+ cache[cache_key] = (rId, int(cx), int(cy), filename)
+
+ # Always assign a fresh shape_id per insertion so that drawing IDs
+ # are unique in every part (including headers/footers/footnotes
+ # which are not renumbered by fix_docpr_ids()).
+ self.tpl.docx_ids_index += 1
+ shape_id = self.tpl.docx_ids_index
+
+ # Generate XML from the fast template when compatible, with a native
+ # python-docx fallback if its generated XML shape ever changes.
+ pic = _format_inline_image_xml(shape_id, rId, filename, cx, cy)
+
if self.anchor:
run = parse_xml(pic)
if run.xpath(".//a:blip"):
hyperlink = self._add_hyperlink(
- run, self.anchor, self.tpl.current_rendering_part
+ run, self.anchor, part
)
pic = hyperlink.xml
diff --git a/docxtpl/richtext.py b/docxtpl/richtext.py
index f0f4738..c79ac1e 100644
--- a/docxtpl/richtext.py
+++ b/docxtpl/richtext.py
@@ -62,6 +62,14 @@ def add(
if style:
prop += '' % style
+ if font:
+ regional_font = ""
+ if ":" in font:
+ region, font = font.split(":", 1)
+ regional_font = ' w:{region}="{font}"'.format(font=font, region=region)
+ prop += ''.format(
+ font=font, regional_font=regional_font
+ )
if color:
if color[0] == "#":
color = color[1:]
@@ -100,14 +108,6 @@ def add(
prop += '' % underline
if strike:
prop += ""
- if font:
- regional_font = ""
- if ":" in font:
- region, font = font.split(":", 1)
- regional_font = ' w:{region}="{font}"'.format(font=font, region=region)
- prop += ''.format(
- font=font, regional_font=regional_font
- )
if rtl:
prop += ''
if lang:
diff --git a/docxtpl/template.py b/docxtpl/template.py
index f20280a..73d8bd0 100644
--- a/docxtpl/template.py
+++ b/docxtpl/template.py
@@ -8,17 +8,53 @@
from os import PathLike
from typing import TYPE_CHECKING, Any, Optional, IO, Union, Dict, Set
-import functools
import io
from lxml import etree
from docx import Document
from docx.opc.oxml import parse_xml
from docx.opc.part import XmlPart
import docx.oxml.ns
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE
-from jinja2 import Environment, Template, meta
+from jinja2 import Environment, meta
from jinja2.exceptions import TemplateError
+
+def _create_optimized_env(**kwargs):
+    """Create the Jinja2 environment docxtpl uses when the caller gives none.
+
+    Defaults applied (each can be overridden through ``kwargs``):
+    - auto_reload=False: templates are not re-checked for source changes
+    - cache_size=400: Jinja2's own default, stated explicitly
+    - enable_async=False: Jinja2's own default, stated explicitly
+
+    Every other keyword argument is forwarded unchanged to
+    ``jinja2.Environment``.
+    """
+    options = {
+        "auto_reload": False,
+        "cache_size": 400,
+        "enable_async": False,
+    }
+    # Caller-supplied values win. Passing them next to hard-coded keywords
+    # would raise "got multiple values for keyword argument".
+    options.update(kwargs)
+    return Environment(**options)
+
+
+# Module-level cached environments (created once, reused across all instances)
+_CACHED_ENV = None
+_CACHED_ENV_AUTOESCAPE = None
+
+
+def _get_cached_env(autoescape=False):
+    """Return the shared Jinja2 environment for the given autoescape mode.
+
+    Two module-level environments are kept, one per mode; each is created
+    by ``_create_optimized_env()`` the first time it is requested.
+    """
+    global _CACHED_ENV, _CACHED_ENV_AUTOESCAPE
+
+    if not autoescape:
+        env = _CACHED_ENV
+        if env is None:
+            env = _CACHED_ENV = _create_optimized_env(autoescape=False)
+        return env
+
+    env = _CACHED_ENV_AUTOESCAPE
+    if env is None:
+        env = _CACHED_ENV_AUTOESCAPE = _create_optimized_env(autoescape=True)
+    return env
+
try:
from html import escape # noqa: F401
except ImportError:
@@ -43,6 +79,79 @@ class DocxTemplate(object):
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer"
)
+ # Pre-compiled regex patterns for patch_xml() optimization
+ # These are compiled once at class load time, not on every render
+ _RE_JINJA_OPEN = re.compile(
+ r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}#])(<[^>]*>)+(?=\})",
+ re.DOTALL
+ )
+ _RE_JINJA_CONTENT = re.compile(
+ r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*",
+ re.DOTALL
+ )
+ _RE_COLSPAN = re.compile(
+ r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)",
+ re.DOTALL
+ )
+ _RE_CELLBG = re.compile(
+ r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)",
+ re.DOTALL
+ )
+ _RE_SPACE_PRESERVE = re.compile(
+ r"((?:(?!).)*)({{.*?}}|{%.*?%})",
+ re.DOTALL
+ )
+ _RE_SPACE_PRESERVE_R = re.compile(
+ r"({{r\s.*?}}|{%r\s.*?%})",
+ re.DOTALL
+ )
+ _RE_MERGE_PREV = re.compile(r"(?:(?!).)*?{%-", re.DOTALL)
+ _RE_MERGE_NEXT = re.compile(r"-%}(?:(?!]|{%|{{).)*?]*?>", re.DOTALL)
+ _RE_VMERGE = re.compile(
+ r"](?:(?!]).)*?{%\s*vm\s*%}.*?]",
+ re.DOTALL
+ )
+ _RE_HMERGE = re.compile(
+ r"](?:(?!]).)*?{%\s*hm\s*%}.*?]",
+ re.DOTALL
+ )
+ _RE_CLEAN_TAGS = re.compile(r"(?<=\{[\{%])(.*?)(?=[\}%]})")
+ _RE_PARAGRAPH_NEWLINE = re.compile(r"])")
+ _RE_PARAGRAPH_REMOVE_NEWLINE = re.compile(r"\n])")
+ _RE_STRIPTAGS = re.compile(r".*?(|]*>)", re.DOTALL)
+ _RE_COLSPAN_EMPTY = re.compile(r"](?:(?!]).)*.*?", re.DOTALL)
+ _RE_GRIDSPAN = re.compile(r"")
+ _RE_TCPR = re.compile(r"(]*>)")
+ _RE_SHD = re.compile(r"")
+ _RE_RESOLVE_PARAGRAPH = re.compile(r"]*)?>.*?", re.DOTALL)
+ _RE_RESOLVE_RUN = re.compile(r"]*)?>.*?", re.DOTALL)
+ _RE_RESOLVE_TEXT = re.compile(r"]*)?>.*?", re.DOTALL)
+ _RE_RUN_PROPS = re.compile(r".*?")
+ _RE_PARA_PROPS = re.compile(r".*?")
+
+ # Pre-compiled patterns for tag-stripping in patch_xml().
+ # Strips surrounding tags from {%y ...%} / {{y ...}} template tags.
+ _RE_TAG_STRIP = tuple(
+ re.compile(
+ r"](?:(?!]).)*({%%|{{)%s ([^}%%]*(?:%%}|}})).*?"
+ % (y, y, y, y),
+ re.DOTALL,
+ )
+ for y in ("tr", "tc", "p", "r")
+ )
+ # Same for {#y ...#} comment tags (not 'r' - comments in runs are uncommon).
+ _RE_COMMENT_STRIP = tuple(
+ re.compile(
+ r"](?:(?!]).)*({#)%s ([^}#]*(?:#})).*?"
+ % (y, y, y, y),
+ re.DOTALL,
+ )
+ for y in ("tr", "tc", "p")
+ )
+
+ # Cached delimiter patterns for fast header/footer Jinja detection.
+ _JINJA_START_PATTERNS = {}
+
def __init__(self, template_file: Union[IO[bytes], str, PathLike]) -> None:
self.template_file = template_file
self.reset_replacements()
@@ -60,8 +169,113 @@ def render_init(self):
self.init_docx()
self.pic_map = {}
self.current_rendering_part = None
- self.docx_ids_index = 1000
+ self._image_cache = {}
self.is_saved = False
+ self._init_image_parts_index()
+ self._init_docx_ids_index()
+
+ def _init_docx_ids_index(self):
+     """Set docx_ids_index to at least the highest existing wp:docPr id.
+
+     Every package part whose content type starts with
+     ``application/vnd.openxmlformats-officedocument`` is parsed and its
+     ``wp:docPr`` elements inspected. The counter is set to the largest
+     integer ``id`` found, with a floor of 1000. Parts that cannot be
+     parsed and ids that are missing or non-numeric are ignored.
+
+     Callers increment the counter before using it, so the first id handed
+     out is strictly greater than every id seen here.
+     """
+     tag = "{%s}docPr" % docx.oxml.ns.nsmap["wp"]
+     max_id = 0
+
+     for part in self.docx._part._package.parts:
+         # Filter on content type first so parts that are skipped anyway
+         # never have their blob read.
+         content_type = getattr(part, "content_type", "")
+         if not content_type.startswith(
+             "application/vnd.openxmlformats-officedocument"
+         ):
+             continue
+         # Read the blob once: for XML parts python-docx produces it by
+         # serializing the element tree, which is not free.
+         blob = getattr(part, "blob", None)
+         if blob is None:
+             continue
+         try:
+             tree = etree.fromstring(blob)
+         except (etree.XMLSyntaxError, ValueError, TypeError):
+             # Not parseable as XML (e.g. a binary payload): nothing to scan.
+             continue
+         for elt in tree.iter(tag):
+             try:
+                 max_id = max(max_id, int(elt.get("id")))
+             except (TypeError, ValueError):
+                 # Missing (None) or non-numeric id attribute.
+                 pass
+
+     # Floor of 1000 keeps the starting value used before this scan existed.
+     self.docx_ids_index = max(max_id, 1000)
+
+ def _init_image_parts_index(self):
+     """Reset the image bookkeeping used while rendering.
+
+     Creates an empty descriptor -> (image_part, image) index and sets the
+     partname counter to the highest N found among the package's existing
+     ``/imageN.`` partnames (0 when none match), so gaps in the existing
+     numbering cannot lead to a duplicate partname.
+     """
+     existing_parts = self.docx._part._package.image_parts
+
+     # Filled on demand by _get_or_add_image_part().
+     self._image_descriptor_index = {}
+
+     highest = 0
+     for image_part in existing_parts:
+         found = re.search(r'/image(\d+)\.', str(image_part.partname))
+         if found is None:
+             continue
+         highest = max(highest, int(found.group(1)))
+     self._image_part_counter = highest
+
+ def _get_or_add_image_part(self, image_descriptor):
+     """Return ``(image_part, image)`` for *image_descriptor*.
+
+     A ``str`` descriptor is looked up in, and afterwards stored in, the
+     descriptor index, so the same string yields the same tuple on later
+     calls. Any other descriptor always produces a new image part.
+     """
+     from docx.image.image import Image
+     from docx.opc.packuri import PackURI
+     from docx.parts.image import ImagePart
+
+     dedup_key = None
+     if isinstance(image_descriptor, str):
+         dedup_key = image_descriptor
+         hit = self._image_descriptor_index.get(dedup_key)
+         if hit is not None:
+             return hit
+
+     image = Image.from_file(image_descriptor)
+
+     # Next sequential partname, numbered from the per-render counter.
+     self._image_part_counter += 1
+     number = self._image_part_counter
+     partname = PackURI("/word/media/image%d.%s" % (number, image.ext))
+     image_part = ImagePart.from_image(image, partname)
+
+     # Register the new part with the package's image collection.
+     self.docx._part._package.image_parts.append(image_part)
+
+     entry = (image_part, image)
+     if dedup_key is not None:
+         self._image_descriptor_index[dedup_key] = entry
+     return entry
def __getattr__(self, name):
return getattr(self.docx, name)
@@ -88,114 +302,64 @@ def patch_xml(self, src_xml):
unescape html entities, etc..."""
# replace {{ by {{ ( works with {{ }} {% and %} {# and #})
- src_xml = re.sub(
- r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}\#])(<[^>]*>)+(?=\})",
- "",
- src_xml,
- flags=re.DOTALL,
- )
+ src_xml = self._RE_JINJA_OPEN.sub("", src_xml)
# replace {{jinja2 stuff}} by {{jinja2 stuff}}
# same thing with {% ... %} and {# #}
# "jinja2 stuff" could a variable, a 'if' etc... anything jinja2 will understand
def striptags(m):
- return re.sub(
- ".*?(|]*>)", "", m.group(0), flags=re.DOTALL
- )
+ return self._RE_STRIPTAGS.sub("", m.group(0))
- src_xml = re.sub(
- r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*",
- striptags,
- src_xml,
- flags=re.DOTALL,
- )
+ src_xml = self._RE_JINJA_CONTENT.sub(striptags, src_xml)
# manage table cell colspan
def colspan(m):
cell_xml = m.group(1) + m.group(3)
- cell_xml = re.sub(
- r"](?:(?!]).)*.*?",
- "",
- cell_xml,
- flags=re.DOTALL,
- )
- cell_xml = re.sub(r"", "", cell_xml, count=1)
- return re.sub(
- r"(]*>)",
+ cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml)
+ cell_xml = self._RE_GRIDSPAN.sub("", cell_xml, count=1)
+ return self._RE_TCPR.sub(
r'\1' % m.group(2),
cell_xml,
)
- src_xml = re.sub(
- r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)",
- colspan,
- src_xml,
- flags=re.DOTALL,
- )
+ src_xml = self._RE_COLSPAN.sub(colspan, src_xml)
# manage table cell background color
def cellbg(m):
cell_xml = m.group(1) + m.group(3)
- cell_xml = re.sub(
- r"](?:(?!]).)*.*?",
- "",
- cell_xml,
- flags=re.DOTALL,
- )
- cell_xml = re.sub(r"", "", cell_xml, count=1)
- return re.sub(
- r"(]*>)",
+ cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml)
+ cell_xml = self._RE_SHD.sub("", cell_xml, count=1)
+ return self._RE_TCPR.sub(
r'\1' % m.group(2),
cell_xml,
)
- src_xml = re.sub(
- r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)",
- cellbg,
- src_xml,
- flags=re.DOTALL,
- )
+ src_xml = self._RE_CELLBG.sub(cellbg, src_xml)
# ensure space preservation
- src_xml = re.sub(
- r"((?:(?!).)*)({{.*?}}|{%.*?%})",
+ src_xml = self._RE_SPACE_PRESERVE.sub(
r'\1\2',
src_xml,
- flags=re.DOTALL,
)
- src_xml = re.sub(
- r"({{r\s.*?}}|{%r\s.*?%})",
+ src_xml = self._RE_SPACE_PRESERVE_R.sub(
r'\1',
src_xml,
- flags=re.DOTALL,
)
# {%- will merge with previous paragraph text
- src_xml = re.sub(r"(?:(?!).)*?{%-", "{%", src_xml, flags=re.DOTALL)
+ src_xml = self._RE_MERGE_PREV.sub("{%", src_xml)
# -%} will merge with next paragraph text
- src_xml = re.sub(
- r"-%}(?:(?!]|{%|{{).)*?]*?>", "%}", src_xml, flags=re.DOTALL
- )
+ src_xml = self._RE_MERGE_NEXT.sub("%}", src_xml)
- for y in ["tr", "tc", "p", "r"]:
- # replace into xml code the row/paragraph/run containing
- # {%y xxx %} or {{y xxx}} template tag
- # by {% xxx %} or {{ xx }} without any surrounding tags :
- # This is mandatory to have jinja2 generating correct xml code
- pat = (
- r"](?:(?!]).)*({%%|{{)%(y)s ([^}%%]*(?:%%}|}})).*?"
- % {"y": y}
- )
- src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL)
-
- for y in ["tr", "tc", "p"]:
- # same thing, but for {#y xxx #} (but not where y == 'r', since that
- # makes less sense to use comments in that context
- pat = (
- r"](?:(?!]).)*({#)%(y)s ([^}#]*(?:#})).*?"
- % {"y": y}
- )
- src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL)
+ # Strip surrounding tags from {%y ...%} / {{y ...}} template tags.
+ # This is mandatory for jinja2 to generate correct xml code.
+ # Patterns are pre-compiled as class attributes to avoid recompilation.
+ for pat in self._RE_TAG_STRIP:
+ src_xml = pat.sub(r"\1 \2", src_xml)
+
+ # Same for {#y ...#} comment tags (not 'r' — comments in runs are uncommon).
+ for pat in self._RE_COMMENT_STRIP:
+ src_xml = pat.sub(r"\1 \2", src_xml)
# add vMerge
# use {% vm %} to make this table cell and its copies
@@ -220,12 +384,7 @@ def v_merge(m1):
flags=re.DOTALL,
)
- src_xml = re.sub(
- r"](?:(?!]).)*?{%\s*vm\s*%}.*?]",
- v_merge_tc,
- src_xml,
- flags=re.DOTALL,
- )
+ src_xml = self._RE_VMERGE.sub(v_merge_tc, src_xml)
# Use ``{% hm %}`` to make table cell become horizontally merged within
# a ``{% for %}``.
@@ -279,12 +438,7 @@ def without_gridspan(m2):
# Discard every other cell generated in loop.
return "{% if loop.first %}" + xml + "{% endif %}"
- src_xml = re.sub(
- r"](?:(?!]).)*?{%\s*hm\s*%}.*?]",
- h_merge_tc,
- src_xml,
- flags=re.DOTALL,
- )
+ src_xml = self._RE_HMERGE.sub(h_merge_tc, src_xml)
def clean_tags(m):
return (
@@ -298,18 +452,17 @@ def clean_tags(m):
.replace("’", "'")
)
- src_xml = re.sub(r"(?<=\{[\{%])(.*?)(?=[\}%]})", clean_tags, src_xml)
+ src_xml = self._RE_CLEAN_TAGS.sub(clean_tags, src_xml)
return src_xml
def render_xml_part(self, src_xml, part, context, jinja_env=None):
- src_xml = re.sub(r"])", r"\n])", r" None:
if jinja_env is None:
- jinja_env = Environment()
+ jinja_env = _get_cached_env()
for section in self.docx.sections:
for part in section.part.package.parts:
@@ -378,6 +531,10 @@ def render_footnotes(
part._blob = xml.encode("utf-8")
def resolve_listing(self, xml):
+ # Early exit: if no Listing special characters are present (common case),
+ # there's nothing to resolve, skip the work below.
+ if "\t" not in xml and "\n" not in xml and "\a" not in xml and "\f" not in xml:
+ return xml
def resolve_text(run_properties, paragraph_properties, m):
xml = m.group(0).replace(
@@ -403,30 +560,24 @@ def resolve_text(run_properties, paragraph_properties, m):
return xml
def resolve_run(paragraph_properties, m):
- run_properties = re.search(r".*?", m.group(0))
+ run_properties = self._RE_RUN_PROPS.search(m.group(0))
run_properties = run_properties.group(0) if run_properties else ""
- return re.sub(
- r"]*)?>.*?",
+ return self._RE_RESOLVE_TEXT.sub(
lambda x: resolve_text(run_properties, paragraph_properties, x),
m.group(0),
- flags=re.DOTALL,
)
def resolve_paragraph(m):
- paragraph_properties = re.search(r".*?", m.group(0))
+ paragraph_properties = self._RE_PARA_PROPS.search(m.group(0))
paragraph_properties = (
paragraph_properties.group(0) if paragraph_properties else ""
)
- return re.sub(
- r"]*)?>.*?",
+ return self._RE_RESOLVE_RUN.sub(
lambda x: resolve_run(paragraph_properties, x),
m.group(0),
- flags=re.DOTALL,
)
- xml = re.sub(
- r"]*)?>.*?", resolve_paragraph, xml, flags=re.DOTALL
- )
+ xml = self._RE_RESOLVE_PARAGRAPH.sub(resolve_paragraph, xml)
return xml
@@ -437,9 +588,57 @@ def build_xml(self, context, jinja_env=None):
return xml
def map_tree(self, tree):
+ """Replace the document body element with the rendered tree.
+
+ The current <w:body> is located among the direct children of the
+ document root, detached with root.remove(), and ``tree`` is inserted
+ at the same child index with root.insert(), so the order of the
+ root's children is preserved.
+
+ Fallbacks: if the body is not found among the root's direct children,
+ the children of the existing body are removed and the children of
+ ``tree`` are appended to it instead. If remove()/insert() raises, the
+ old body is re-attached when it was already detached and the same
+ child-by-child copy is performed.
+ """
root = self.docx._element
- body = root.body
- root.replace(body, tree)
+ old_body = root.body
+
+ # Find where <w:body> sits among root's direct children so we can
+ # re-insert the new tree at the same position.
+ body_index = None
+ for i, child in enumerate(root):
+ if child is old_body:
+ body_index = i
+ break
+
+ if body_index is None:
+ # Body is not a direct child of root (unexpected document layout).
+ # Fall back to child-by-child replacement on the existing body.
+ for child in list(old_body):
+ old_body.remove(child)
+ for child in list(tree):
+ old_body.append(child)
+ return
+
+ try:
+ # Detach the old body and insert the new tree (presumably itself a
+ # <w:body> element - confirm against build_xml/fix_tables) at the
+ # same position.
+ root.remove(old_body)
+ root.insert(body_index, tree)
+ except Exception:
+ # If something went wrong, restore the document to a usable state
+ # by re-attaching the old body (if it was already detached) and
+ # falling back to child-by-child copy.
+ if old_body.getparent() is None:
+ root.insert(body_index, old_body)
+ for child in list(old_body):
+ old_body.remove(child)
+ for child in list(tree):
+ old_body.append(child)
def get_headers_footers(self, uri):
for relKey, val in self.docx._part.rels.items():
@@ -455,13 +654,52 @@ def get_headers_footers_encoding(self, xml):
return m.group(1)
return "utf-8"
+ @classmethod
+ def _get_jinja_start_pattern(cls, delimiter):
+     """Return a compiled regex that finds *delimiter* in XML text.
+
+     The pattern allows any number of XML tags between consecutive
+     characters of the delimiter. Compiled patterns are memoized per
+     delimiter in ``_JINJA_START_PATTERNS``.
+     """
+     cache = cls._JINJA_START_PATTERNS
+     try:
+         return cache[delimiter]
+     except KeyError:
+         pass
+
+     # Word can split a delimiter across XML runs, leaving markup between
+     # its characters, so tags are tolerated at every character boundary.
+     tag_gap = r"(<[^>]*>)*"
+     escaped_chars = [re.escape(char) for char in delimiter]
+     compiled = re.compile(tag_gap.join(escaped_chars), re.DOTALL)
+     cache[delimiter] = compiled
+     return compiled
+
+ def _has_jinja_tags(self, xml, jinja_env=None):
+     """Tell whether *xml* contains a Jinja start delimiter.
+
+     The block, variable and comment start strings are taken from
+     *jinja_env* (the shared cached environment when None), so custom
+     delimiters are detected as well.
+     """
+     env = _get_cached_env() if jinja_env is None else jinja_env
+
+     openers = (
+         env.block_start_string,
+         env.variable_start_string,
+         env.comment_start_string,
+     )
+     for opener in openers:
+         # Cheap substring test first for intact delimiters.
+         if opener in xml:
+             return True
+         # Regex test catches delimiters broken up by XML run markup.
+         if self._get_jinja_start_pattern(opener).search(xml):
+             return True
+     return False
+
def build_headers_footers_xml(self, context, uri, jinja_env=None):
for relKey, part in self.get_headers_footers(uri):
xml = self.get_part_xml(part)
+
encoding = self.get_headers_footers_encoding(xml)
- xml = self.patch_xml(xml)
- xml = self.render_xml_part(xml, part, context, jinja_env)
- yield relKey, xml.encode(encoding)
+
+ # Skip rendering if no Jinja tags present. Use the active Jinja
+ # environment so custom delimiters in headers/footers are honored.
+ if self._has_jinja_tags(xml, jinja_env):
+ xml = self.patch_xml(xml)
+ xml = self.render_xml_part(xml, part, context, jinja_env)
+ yield relKey, xml.encode(encoding)
+ else:
+ yield relKey, xml.encode(encoding)
def map_headers_footers_xml(self, relKey, xml):
part = self.docx._part.rels[relKey].target_part
@@ -479,58 +717,102 @@ def render(
# init template working attributes
self.render_init()
- if autoescape:
- if not jinja_env:
- jinja_env = Environment(autoescape=autoescape)
- else:
- jinja_env.autoescape = autoescape
+ # Use cached environment by default
+ if not jinja_env:
+ jinja_env = _get_cached_env(autoescape=autoescape)
+ elif autoescape:
+ jinja_env.autoescape = autoescape
# Body
xml_src = self.build_xml(context, jinja_env)
- # fix tables if needed
+ # Fix tables if needed
tree = self.fix_tables(xml_src)
- # fix docPr ID's
+ # Fix docPr ID's
self.fix_docpr_ids(tree)
# Replace body xml tree
self.map_tree(tree)
- # Headers
- headers = self.build_headers_footers_xml(context, self.HEADER_URI, jinja_env)
- for relKey, xml in headers:
- self.map_headers_footers_xml(relKey, xml)
-
- # Footers
- footers = self.build_headers_footers_xml(context, self.FOOTER_URI, jinja_env)
- for relKey, xml in footers:
- self.map_headers_footers_xml(relKey, xml)
-
+ # Headers & Footers - skip when no Jinja tags are present.
+ for uri in (self.HEADER_URI, self.FOOTER_URI):
+ try:
+ has_jinja = any(
+ self._has_jinja_tags(xml, jinja_env)
+ for xml in (
+ self.get_part_xml(part)
+ for _relKey, part in self.get_headers_footers(uri)
+ )
+ )
+ if has_jinja:
+ for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env):
+ self.map_headers_footers_xml(relKey, xml)
+ except Exception:
+ # Fallback: guards against unexpected part structure (e.g. blob
+ # is None, missing attributes). Not malformed XML - that would
+ # fail in build_headers_footers_xml regardless.
+ for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env):
+ self.map_headers_footers_xml(relKey, xml)
+
+ # Properties: no skip-check needed - these are a handful of short
+ # strings (author, title, etc.) where from_string() is near-zero cost.
self.render_properties(context, jinja_env)
+ # Footnotes: no skip-check needed - at most one part exists in typical
+ # documents, and many have none, so the loop body rarely executes.
self.render_footnotes(context, jinja_env)
# set rendered flag
self.is_rendered = True
- # using of TC tag in for cycle can cause that count of columns does not
- # correspond to real count of columns in row. This function is able to fix it.
+ # Using of TC tag in for cycle can cause that count of columns does not
+ # correspond to real count of columns in row.
def fix_tables(self, xml):
- parser = etree.XMLParser(recover=True)
- tree = etree.fromstring(xml, parser=parser)
+ # Use parse_xml with safe fallback for malformed XML
+ try:
+ tree = parse_xml(xml) # parse_xml() is significantly faster
+ except Exception:
+ # Fallback to permissive parser in the event of malformed XML
+ parser = etree.XMLParser(recover=True)
+ tree = etree.fromstring(xml, parser=parser)
# get namespace
ns = "{" + tree.nsmap["w"] + "}"
# walk trough xml and find table
for t in tree.iter(ns + "tbl"):
tblGrid = t.find(ns + "tblGrid")
+ if tblGrid is None:
+ continue
+
columns = tblGrid.findall(ns + "gridCol")
- to_add = 0
- # walk trough all rows and try to find if there is higher cell count
+ columns_len = len(columns)
+
+ # Single pass row analysis with both counters
+ max_raw_cells = 0 # For ADD decision (raw tc count)
+ max_effective_cells = 0 # For REMOVE decision (with gridSpan)
+
for r in t.iter(ns + "tr"):
cells = r.findall(ns + "tc")
- if (len(columns) + to_add) < len(cells):
- to_add = len(cells) - len(columns)
+ raw_count = len(cells)
+ effective_count = 0
+
+ for cell in cells:
+ tc_pr = cell.find(ns + "tcPr")
+ if tc_pr is not None:
+ grid_span = tc_pr.find(ns + "gridSpan")
+ if grid_span is not None:
+ effective_count += int(grid_span.get(ns + "val"))
+ continue
+ effective_count += 1
+
+ if raw_count > max_raw_cells:
+ max_raw_cells = raw_count
+ if effective_count > max_effective_cells:
+ max_effective_cells = effective_count
+
+ # ADD columns based on RAW cell count (original behavior)
+ to_add = max_raw_cells - columns_len if max_raw_cells > columns_len else 0
+
# is necessary to add columns?
if to_add > 0:
# at first, calculate width of table according to columns
@@ -552,34 +834,16 @@ def fix_tables(self, xml):
int(float(c.get(ns + "w")) * new_average / old_average)
),
)
- # add new columns
+ # add new columns using OxmlElement for proper python-docx compatibility
for i in range(to_add):
- etree.SubElement(
- tblGrid, ns + "gridCol", {ns + "w": str(int(new_average))}
- )
+ new_col = OxmlElement('w:gridCol')
+ new_col.set(qn('w:w'), str(int(new_average)))
+ tblGrid.append(new_col)
- # Refetch columns after columns addition.
+ # REMOVE columns based on EFFECTIVE cell count (original behavior)
columns = tblGrid.findall(ns + "gridCol")
columns_len = len(columns)
-
- cells_len_max = 0
-
- def get_cell_len(total, cell):
- tc_pr = cell.find(ns + "tcPr")
- grid_span = None if tc_pr is None else tc_pr.find(ns + "gridSpan")
-
- if grid_span is not None:
- return total + int(grid_span.get(ns + "val"))
-
- return total + 1
-
- # Calculate max of table cells to compare with `gridCol`.
- for r in t.iter(ns + "tr"):
- cells = r.findall(ns + "tc")
- cells_len = functools.reduce(get_cell_len, cells, 0)
- cells_len_max = max(cells_len_max, cells_len)
-
- to_remove = columns_len - cells_len_max
+ to_remove = columns_len - max_effective_cells if columns_len > max_effective_cells else 0
# If after the loop, there're less columns, than
# originally was, remove extra `gridCol` declarations.
@@ -608,8 +872,11 @@ def get_cell_len(total, cell):
return tree
def fix_docpr_ids(self, tree):
- # some Ids may have some collisions : so renumbering all of them :
- for elt in tree.xpath("//wp:docPr", namespaces=docx.oxml.ns.nsmap):
+ # Some Ids may have some collisions : so renumbering all of them
+ wp_ns = docx.oxml.ns.nsmap['wp']
+ tag = "{%s}docPr" % wp_ns
+
+ for elt in tree.iter(tag):
self.docx_ids_index += 1
elt.attrib["id"] = str(self.docx_ids_index)
@@ -913,7 +1180,7 @@ def get_undeclared_template_variables(
if jinja_env:
env = jinja_env
else:
- env = Environment()
+ env = _get_cached_env()
parse_content = env.parse(xml)
all_variables = meta.find_undeclared_variables(parse_content)
diff --git a/pyproject.toml b/pyproject.toml
index 476fc0e..28533c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,10 @@ repository = "https://github.com/elapouya/python-docx-template.git"
document = "https://docxtpl.readthedocs.org"
[tool.poetry]
+name = "docxtpl"
version = "0.0.0"
+description = "Python docx template engine"
+authors = ["Eric Lapouyade "]
[tool.poetry.requires-plugins]
poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] }
diff --git a/tests/header_footer_custom_delimiters.py b/tests/header_footer_custom_delimiters.py
new file mode 100644
index 0000000..05d3329
--- /dev/null
+++ b/tests/header_footer_custom_delimiters.py
@@ -0,0 +1,52 @@
+"""Check that header and footer tags using custom Jinja delimiters render."""
+import os
+import zipfile
+
+import jinja2
+from docx import Document
+
+from docxtpl import DocxTemplate
+
+
+def _read_members(archive, prefix):
+    """Join the UTF-8 text of every archive member whose name has *prefix*."""
+    texts = []
+    for name in archive.namelist():
+        if name.startswith(prefix):
+            texts.append(archive.read(name).decode("utf-8"))
+    return "\n".join(texts)
+
+
+template_path = "output/header_footer_custom_delimiters_tpl.docx"
+output_path = "output/header_footer_custom_delimiters.docx"
+
+os.makedirs("output", exist_ok=True)
+
+# Build a template whose header and footer use "[[ ... ]]" variables.
+document = Document()
+document.add_paragraph("Body text")
+section = document.sections[0]
+section.header.paragraphs[0].text = "[[ date ]]"
+section.footer.paragraphs[0].text = "[[ company_name ]]"
+document.save(template_path)
+
+jinja_env = jinja2.Environment(
+    variable_start_string="[[",
+    variable_end_string="]]",
+)
+
+context = {
+    "company_name": "The World Wide company",
+    "date": "2016-03-17",
+}
+
+tpl = DocxTemplate(template_path)
+tpl.render(context, jinja_env=jinja_env)
+tpl.save(output_path)
+
+with zipfile.ZipFile(output_path) as docx_zip:
+    header_xml = _read_members(docx_zip, "word/header")
+    footer_xml = _read_members(docx_zip, "word/footer")
+
+# Values must be substituted and no start delimiter may remain.
+assert "2016-03-17" in header_xml
+assert "The World Wide company" in footer_xml
+assert "[[" not in header_xml
+assert "[[" not in footer_xml
diff --git a/tests/inline_image_file_like_cache.py b/tests/inline_image_file_like_cache.py
new file mode 100644
index 0000000..1b6784d
--- /dev/null
+++ b/tests/inline_image_file_like_cache.py
@@ -0,0 +1,30 @@
+"""Check that image metadata is never cached for file-like descriptors."""
+import io
+import re
+
+from docxtpl import DocxTemplate, InlineImage
+
+
+def image_bytes(path):
+    """Return the raw bytes stored at *path*."""
+    with open(path, "rb") as source:
+        payload = source.read()
+    return payload
+
+
+def embedded_rid(xml):
+    """Return the value of the first r:embed attribute found in *xml*."""
+    found = re.search(r'r:embed="([^"]+)"', xml)
+    return found.group(1)
+
+
+tpl = DocxTemplate("templates/inline_image_tpl.docx")
+tpl.render_init()
+tpl.current_rendering_part = tpl.docx._part
+
+# First insertion: the stream holds the django image.
+stream = io.BytesIO(image_bytes("templates/django.png"))
+first_xml = str(InlineImage(tpl, stream))
+
+# Overwrite the very same stream object in place with a different image.
+replacement = image_bytes("templates/python.png")
+stream.seek(0)
+stream.truncate()
+stream.write(replacement)
+stream.seek(0)
+second_xml = str(InlineImage(tpl, stream))
+
+# Different content must map to a different relationship, and nothing may
+# have been stored in the per-render image cache.
+assert embedded_rid(first_xml) != embedded_rid(second_xml)
+assert tpl._image_cache == {}
diff --git a/tests/inline_image_xml_template.py b/tests/inline_image_xml_template.py
new file mode 100644
index 0000000..b5393ab
--- /dev/null
+++ b/tests/inline_image_xml_template.py
@@ -0,0 +1,45 @@
+"""Check the inline-image XML template and its python-docx fallback."""
+from docx.oxml import parse_xml
+
+import docxtpl.inline_image as inline_image
+
+
+_INLINE_IMAGE_XML = inline_image._get_inline_image_xml_template()
+
+# Every placeholder must occur the expected number of times.
+assert _INLINE_IMAGE_XML.count("{shape_id}") == 2
+assert _INLINE_IMAGE_XML.count("{cx}") == 2
+assert _INLINE_IMAGE_XML.count("{cy}") == 2
+assert _INLINE_IMAGE_XML.count("{rId}") == 1
+assert _INLINE_IMAGE_XML.count("{filename}") == 1
+
+# A formatted template must parse as XML.
+parse_xml(
+    _INLINE_IMAGE_XML.format(
+        shape_id=1,
+        cx=2,
+        cy=3,
+        rId="rId1",
+        filename="image.png",
+    )
+)
+
+
+def raise_incompatible_template():
+    """Stand-in builder that simulates an incompatible python-docx."""
+    raise RuntimeError("incompatible template")
+
+
+original_template = inline_image._INLINE_IMAGE_XML
+original_builder = inline_image._build_inline_image_xml_template
+try:
+    # Force the fallback path: no cached template and a failing builder.
+    inline_image._INLINE_IMAGE_XML = None
+    inline_image._build_inline_image_xml_template = raise_incompatible_template
+    fallback_xml = inline_image._format_inline_image_xml(
+        shape_id=1,
+        rId="rId1",
+        filename='quoted " image.png',
+        cx=2,
+        cy=3,
+    )
+    parse_xml(fallback_xml)
+    # A double quote inside an attribute value is serialized as &quot;.
+    assert 'name="quoted &quot; image.png"' in fallback_xml
+finally:
+    # Restore module state so later code sees the real template and builder.
+    inline_image._INLINE_IMAGE_XML = original_template
+    inline_image._build_inline_image_xml_template = original_builder