feat: add comprehensive GitHub workflow and development tools

2025-09-06 18:31:54 +02:00
commit ab23d7187e
10224 changed files with 2075210 additions and 0 deletions
--- a/app/.venv/Lib/site-packages/weasyprint/text/line_break.py
+++ b/app/.venv/Lib/site-packages/weasyprint/text/line_break.py
@@ -0,0 +1,545 @@
+"""Decide where to break text lines."""
+
+import re
+from math import inf
+
+import pyphen
+
+from .constants import LST_TO_ISO, PANGO_WRAP_MODE
+from .fonts import font_features, get_font_description
+
+from .ffi import (  # isort:skip
+    ffi, gobject, pango, pangoft2, unicode_to_char_p, units_from_double,
+    units_to_double)
+
+
+def line_size(line, style):
+    """Return the logical ``(width, height)`` of the Pango layout ``line``.
+
+    When ``style['letter_spacing']`` is not ``'normal'``, its value is added
+    to the returned width.
+
+    """
+    extents = ffi.new('PangoRectangle *')
+    # Only the logical rectangle is requested, the ink rectangle is skipped
+    pango.pango_layout_line_get_extents(line, ffi.NULL, extents)
+    width = units_to_double(extents.width)
+    height = units_to_double(extents.height)
+    ffi.release(extents)
+
+    spacing = style['letter_spacing']
+    if spacing != 'normal':
+        width += spacing
+    return width, height
+
+
+def first_line_metrics(first_line, text, layout, resume_at, space_collapse,
+                       style, hyphenated=False, hyphenation_character=None):
+    """Measure ``first_line`` and return the values describing it.
+
+    Return ``(layout, length, resume_at, width, height, baseline)`` where
+    ``length`` is the byte length of the first line. When the line is
+    ``hyphenated``, the UTF-8 size of ``hyphenation_character`` is removed
+    from that length. Otherwise, when ``resume_at`` is set, the layout is
+    rebuilt with only the text of the first line (trailing spaces removed if
+    ``space_collapse``). The layout is deactivated before being returned.
+
+    """
+    length = first_line.length
+    if hyphenated:
+        length -= len(hyphenation_character.encode())
+    elif resume_at:
+        # Lines have already been split and sizes may differ when drawing, so
+        # no width is set. Rendering is also much faster without a width.
+        pango.pango_layout_set_width(layout.layout, -1)
+
+        # Keep only the text of the first line in the layout
+        line_text = text.encode()[:length].decode()
+        if space_collapse:
+            # Collapsible trailing spaces are dropped
+            line_text = line_text.rstrip(' ')
+        layout.set_text(line_text)
+
+        first_line = layout.get_first_line()[0]
+        length = 0 if first_line is None else first_line.length
+
+    width, height = line_size(first_line, style)
+    baseline_units = pango.pango_layout_get_baseline(layout.layout)
+    baseline = units_to_double(baseline_units)
+    layout.deactivate()
+    return layout, length, resume_at, width, height, baseline
+
+
+class Layout:
+    """Object holding PangoLayout-related cdata pointers.
+
+    The Pango objects are created by ``setup()``, the text is attached by
+    ``set_text()``. ``deactivate()`` drops the Pango layout, language and
+    style, ``reactivate()`` rebuilds them from a style and the stored text.
+
+    """
+    def __init__(self, context, style, justification_spacing=0,
+                 max_width=None):
+        """Build the Pango layout for ``style``.
+
+        ``context`` may be ``None``, in which case a new font map is created.
+        ``justification_spacing`` is added to word spacing when the text is
+        set with ``justify=True``. ``max_width`` is only stored.
+
+        """
+        self.justification_spacing = justification_spacing
+        self.setup(context, style)
+        self.max_width = max_width
+
+    def setup(self, context, style):
+        """Create the Pango context and layout described by ``style``.
+
+        Set ``self.layout`` and ``self.language``, and the font metrics used
+        for text decoration (``None`` when ``text_decoration_line`` is
+        ``'none'``).
+
+        """
+        self.context = context
+        self.style = style
+        self.first_line_direction = 0
+
+        if context is None:
+            font_map = ffi.gc(
+                pangoft2.pango_ft2_font_map_new(), gobject.g_object_unref)
+        else:
+            font_map = context.font_config.font_map
+        pango_context = ffi.gc(
+            pango.pango_font_map_create_context(font_map),
+            gobject.g_object_unref)
+        pango.pango_context_set_round_glyph_positions(pango_context, False)
+
+        # font-language-override takes precedence over the lang of the style
+        if style['font_language_override'] != 'normal':
+            lang_p, lang = unicode_to_char_p(LST_TO_ISO.get(
+                style['font_language_override'].lower(),
+                style['font_language_override']))
+        elif style['lang']:
+            lang_p, lang = unicode_to_char_p(style['lang'])
+        else:
+            lang = None
+        if lang:
+            self.language = pango.pango_language_from_string(lang_p)
+            pango.pango_context_set_language(pango_context, self.language)
+        else:
+            # No language, or a language tag whose encoded form is empty:
+            # always define self.language, deactivate() deletes it.
+            self.language = pango.pango_language_get_default()
+
+        assert not isinstance(style['font_family'], str), (
+            'font_family should be a list')
+        font_description = get_font_description(style)
+        self.layout = ffi.gc(
+            pango.pango_layout_new(pango_context),
+            gobject.g_object_unref)
+        pango.pango_layout_set_font_description(self.layout, font_description)
+
+        text_decoration = style['text_decoration_line']
+        if text_decoration != 'none':
+            metrics = ffi.gc(
+                pango.pango_context_get_metrics(
+                    pango_context, font_description, self.language),
+                pango.pango_font_metrics_unref)
+            self.ascent = units_to_double(
+                pango.pango_font_metrics_get_ascent(metrics))
+            self.underline_position = units_to_double(
+                pango.pango_font_metrics_get_underline_position(metrics))
+            self.strikethrough_position = units_to_double(
+                pango.pango_font_metrics_get_strikethrough_position(metrics))
+            self.underline_thickness = units_to_double(
+                pango.pango_font_metrics_get_underline_thickness(metrics))
+            self.strikethrough_thickness = units_to_double(
+                pango.pango_font_metrics_get_strikethrough_thickness(metrics))
+        else:
+            # Reset all the decoration metrics, so that none of them is
+            # missing or left over from a previous setup() call.
+            self.ascent = None
+            self.underline_position = None
+            self.strikethrough_position = None
+            self.underline_thickness = None
+            self.strikethrough_thickness = None
+
+        features = font_features(
+            style['font_kerning'], style['font_variant_ligatures'],
+            style['font_variant_position'], style['font_variant_caps'],
+            style['font_variant_numeric'], style['font_variant_alternates'],
+            style['font_variant_east_asian'], style['font_feature_settings'])
+        if features and context:
+            features = ','.join(
+                f'{key} {value}' for key, value in features.items()).encode()
+            # In the meantime, keep a cache to avoid leaking too many of them.
+            # NOTE(review): the attribute list below is intentionally not
+            # garbage-collected here, the cached attribute is inserted into
+            # it and shared between layouts -- confirm before freeing it.
+            attr = context.font_features.setdefault(
+                features, pango.pango_attr_font_features_new(features))
+            attr_list = pango.pango_attr_list_new()
+            pango.pango_attr_list_insert(attr_list, attr)
+            pango.pango_layout_set_attributes(self.layout, attr_list)
+
+    def get_first_line(self):
+        """Return ``(first_line, index)`` for the current layout.
+
+        ``index`` is the start index of the second line, or ``None`` when
+        the layout has no second line. The resolved direction of the first
+        line is stored in ``self.first_line_direction``.
+
+        """
+        first_line = pango.pango_layout_get_line_readonly(self.layout, 0)
+        second_line = pango.pango_layout_get_line_readonly(self.layout, 1)
+        index = None if second_line == ffi.NULL else second_line.start_index
+        self.first_line_direction = first_line.resolved_dir
+        return first_line, index
+
+    def set_text(self, text, justify=False):
+        """Set the text of the layout and the attributes depending on it.
+
+        Letter spacing, word spacing and the ``overflow_wrap`` handling are
+        applied through Pango attributes covering byte ranges of the text.
+        When ``justify`` is true, ``self.justification_spacing`` is added to
+        the word spacing.
+
+        """
+        index = text.find('\n')
+        if index != -1:
+            # Keep only the first line plus one character, we don't need more
+            text = text[:index+2]
+        self.text = text
+        text, bytestring = unicode_to_char_p(text)
+        pango.pango_layout_set_text(self.layout, text, -1)
+
+        word_spacing = self.style['word_spacing']
+        if justify:
+            # Justification is needed when drawing text but is useless during
+            # layout, when it can be ignored.
+            word_spacing += self.justification_spacing
+
+        letter_spacing = self.style['letter_spacing']
+        if letter_spacing == 'normal':
+            letter_spacing = 0
+
+        word_breaking = (
+            self.style['overflow_wrap'] in ('anywhere', 'break-word'))
+
+        if self.text and (word_spacing or letter_spacing or word_breaking):
+            # Reuse the attributes already set on the layout, if any
+            attr_list = pango.pango_layout_get_attributes(self.layout)
+            if attr_list == ffi.NULL:
+                attr_list = ffi.gc(
+                    pango.pango_attr_list_new(),
+                    pango.pango_attr_list_unref)
+
+            def add_attr(start, end, spacing):
+                # Apply letter spacing on the [start, end) byte range
+                attr = pango.pango_attr_letter_spacing_new(spacing)
+                attr.start_index, attr.end_index = start, end
+                pango.pango_attr_list_change(attr_list, attr)
+
+            if letter_spacing:
+                letter_spacing = units_from_double(letter_spacing)
+                add_attr(0, len(bytestring), letter_spacing)
+
+            if word_spacing:
+                if bytestring == b' ':
+                    # We need more than one space to set word spacing
+                    self.text = ' \u200b'  # Space + zero-width space
+                    text, bytestring = unicode_to_char_p(self.text)
+                    pango.pango_layout_set_text(self.layout, text, -1)
+
+                # Word spacing is set as extra letter spacing on each space
+                space_spacing = (
+                    units_from_double(word_spacing) + letter_spacing)
+                position = bytestring.find(b' ')
+                # Pango gives only half of word-spacing on boundaries
+                boundary_positions = (0, len(bytestring) - 1)
+                while position != -1:
+                    factor = 1 + (position in boundary_positions)
+                    add_attr(position, position + 1, factor * space_spacing)
+                    position = bytestring.find(b' ', position + 1)
+
+            if word_breaking:
+                attr = pango.pango_attr_insert_hyphens_new(False)
+                attr.start_index, attr.end_index = 0, len(bytestring)
+                pango.pango_attr_list_change(attr_list, attr)
+
+            pango.pango_layout_set_attributes(self.layout, attr_list)
+
+        # Tabs width
+        if b'\t' in bytestring:
+            self.set_tabs()
+
+    def set_tabs(self):
+        """Set the tab stops of the layout from ``style['tab_size']``.
+
+        An integer tab size is a number of spaces, measured with a temporary
+        layout. Other values are read from their ``value`` attribute.
+
+        """
+        if isinstance(self.style['tab_size'], int):
+            layout = Layout(
+                self.context, self.style, self.justification_spacing)
+            layout.set_text(' ' * self.style['tab_size'])
+            line, _ = layout.get_first_line()
+            width, _ = line_size(line, self.style)
+            width = int(round(width))
+        else:
+            width = int(self.style['tab_size'].value)
+        # 0 is not handled correctly by Pango
+        array = ffi.gc(
+            pango.pango_tab_array_new_with_positions(
+                1, True, pango.PANGO_TAB_LEFT, width or 1),
+            pango.pango_tab_array_free)
+        pango.pango_layout_set_tabs(self.layout, array)
+
+    def deactivate(self):
+        """Drop the Pango layout, the language and the style.
+
+        ``self.text`` and ``self.context`` are kept for ``reactivate()``.
+
+        """
+        del self.layout, self.language, self.style
+
+    def reactivate(self, style):
+        """Rebuild the layout with ``style`` and the stored text.
+
+        The text is set with justification enabled.
+
+        """
+        self.setup(self.context, style)
+        self.set_text(self.text, justify=True)
+
+
+def create_layout(text, style, context, max_width, justification_spacing):
+    """Return a ``Layout`` holding ``text``, with default Pango line-breaks.
+
+    The Pango width is only set when ``max_width`` is given, small enough,
+    and ``style['white_space']`` allows wrapping.
+
+    """
+    layout = Layout(context, style, justification_spacing, max_width)
+
+    wrapping_allowed = style['white_space'] in (
+        'normal', 'pre-wrap', 'pre-line')
+    # max_width * Pango.SCALE == max_width * 1024 has to fit in a signed
+    # integer. Bigger values are handled like None: unconstrained width.
+    constrained = (
+        max_width is not None and wrapping_allowed and max_width < 2 ** 21)
+    if constrained:
+        pango_width = units_from_double(max(0, max_width))
+        pango.pango_layout_set_width(layout.layout, pango_width)
+
+    layout.set_text(text)
+    return layout
+
+
+def split_first_line(text, style, context, max_width, justification_spacing,
+                     is_line_start=True, minimum=False):
+    """Fit as much as possible in the available width for one line of text.
+
+    ``text``: the string to lay out
+    ``style``: mapping of computed values; this function reads
+               ``white_space``, ``font_size``, ``lang``, ``hyphens``,
+               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
+               ``hyphenate_character``, ``overflow_wrap`` and ``word_break``
+    ``context``: forwarded to the layouts; its ``dictionaries`` mapping is
+                 used to cache Pyphen dictionaries
+    ``max_width``: available width, ``None`` or ``inf`` when unconstrained;
+                   ignored when ``white_space`` does not allow wrapping
+    ``justification_spacing``: forwarded to the layouts
+    ``is_line_start``: required (unless ``word_break`` is ``break-all``) to
+                       break inside a word with ``overflow_wrap``
+    ``minimum``: when true, ``overflow_wrap: break-word`` does not allow
+                 breaking inside a word
+
+    Return ``(layout, length, resume_index, width, height, baseline)``.
+
+    ``layout``: a pango Layout with the first line
+    ``length``: length in UTF-8 bytes of the first line
+    ``resume_index``: The number of UTF-8 bytes to skip for the next line.
+                      May be ``None`` if the whole text fits in one line.
+                      This may be greater than ``length`` in case of preserved
+                      newline characters.
+    ``width``: width in pixels of the first line
+    ``height``: height in pixels of the first line
+    ``baseline``: baseline in pixels of the first line
+
+    """
+    # See https://www.w3.org/TR/css-text-3/#white-space-property
+    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
+    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')
+
+    # Without wrapping the width does not constrain the line, but the
+    # original value is still given to the layout created below.
+    original_max_width = max_width
+    if not text_wrap:
+        max_width = None
+
+    # Step #1: Get a draft layout with the first line
+    if max_width is not None and max_width != inf and style['font_size']:
+        short_text = text
+        if max_width == 0:
+            # Trying to find minimum size, let's naively split on spaces and
+            # keep one word + one letter
+            space_index = text.find(' ')
+            if space_index != -1:
+                short_text = text[:space_index+2]  # index + space + one letter
+        else:
+            # Heuristic: keep 2.5 characters per font_size of available width
+            short_text = text[:int(max_width / style['font_size'] * 2.5)]
+        # Try to use a small amount of text instead of the whole text
+        layout = create_layout(
+            short_text, style, context, max_width, justification_spacing)
+        first_line, resume_index = layout.get_first_line()
+        if resume_index is None and short_text != text:
+            # The small amount of text fits in one line, give up and use
+            # the whole text
+            layout.set_text(text)
+            first_line, resume_index = layout.get_first_line()
+    else:
+        layout = create_layout(
+            text, style, context, original_max_width, justification_spacing)
+        first_line, resume_index = layout.get_first_line()
+
+    # Step #2: Don't split lines when it's not needed
+    if max_width is None:
+        # The first line can take all the place needed
+        return first_line_metrics(
+            first_line, text, layout, resume_index, space_collapse, style)
+    first_line_width, _ = line_size(first_line, style)
+    if resume_index is None and first_line_width <= max_width:
+        # The first line fits in the available width
+        return first_line_metrics(
+            first_line, text, layout, resume_index, space_collapse, style)
+
+    # Step #3: Try to put the first word of the second line on the first line
+    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
+    # is a good thread related to this problem.
+    # resume_index is a byte offset, so the text is sliced as UTF-8 bytes
+    first_line_text = text.encode()[:resume_index].decode()
+    first_line_fits = (
+        first_line_width <= max_width or
+        ' ' in first_line_text.strip() or
+        can_break_text(first_line_text.strip(), style['lang']))
+    if first_line_fits:
+        # The first line fits but may have been cut too early by Pango
+        second_line_text = text.encode()[resume_index:].decode()
+    else:
+        # The line can't be split earlier, try to hyphenate the first word.
+        first_line_text = ''
+        second_line_text = text
+
+    next_word = second_line_text.split(' ', 1)[0]
+    if next_word:
+        if space_collapse:
+            # next_word might fit without a space afterwards
+            # only try when space collapsing is allowed
+            new_first_line_text = first_line_text + next_word
+            layout.set_text(new_first_line_text)
+            first_line, resume_index = layout.get_first_line()
+            if resume_index is None:
+                if first_line_text:
+                    # The next word fits in the first line, keep the layout
+                    resume_index = len(new_first_line_text.encode()) + 1
+                    return first_line_metrics(
+                        first_line, text, layout, resume_index, space_collapse,
+                        style)
+                else:
+                    # Second line is None
+                    resume_index = first_line.length + 1
+                    if resume_index >= len(text.encode()):
+                        resume_index = None
+    elif first_line_text:
+        # We found something on the first line but we did not find a word on
+        # the next line, no need to hyphenate, we can keep the current layout
+        return first_line_metrics(
+            first_line, text, layout, resume_index, space_collapse, style)
+
+    # Step #4: Try to hyphenate
+    hyphens = style['hyphens']
+    lang = style['lang'] and pyphen.language_fallback(style['lang'])
+    total, left, right = style['hyphenate_limit_chars']
+    hyphenated = False
+    soft_hyphen = '\xad'
+
+    auto_hyphenation = manual_hyphenation = False
+    if hyphens != 'none':
+        manual_hyphenation = soft_hyphen in first_line_text + next_word
+    if hyphens == 'auto' and lang:
+        next_word_boundaries = get_next_word_boundaries(second_line_text, lang)
+        if next_word_boundaries:
+            # We have a word to hyphenate
+            start_word, stop_word = next_word_boundaries
+            next_word = second_line_text[start_word:stop_word]
+            if stop_word - start_word >= total:
+                # This word is long enough
+                first_line_width, _ = line_size(first_line, style)
+                space = max_width - first_line_width
+                if style['hyphenate_limit_zone'].unit == '%':
+                    limit_zone = (
+                        max_width * style['hyphenate_limit_zone'].value / 100)
+                else:
+                    limit_zone = style['hyphenate_limit_zone'].value
+                if space > limit_zone or space < 0:
+                    # Available space is worth the try, or the line is even too
+                    # long to fit: try to hyphenate
+                    auto_hyphenation = True
+
+    # Automatic hyphenation opportunities within a word must be ignored if the
+    # word contains a conditional hyphen, in favor of the conditional
+    # hyphen(s).
+    # See https://drafts.csswg.org/css-text-3/#valdef-hyphens-auto
+    if manual_hyphenation:
+        # Manual hyphenation: check that the line ends with a soft
+        # hyphen and add the missing hyphen
+        if first_line_text.endswith(soft_hyphen):
+            # The first line has been split on a soft hyphen
+            if ' ' in first_line_text:
+                first_line_text, next_word = first_line_text.rsplit(' ', 1)
+                next_word = f' {next_word}'
+                layout.set_text(first_line_text)
+                first_line, _ = layout.get_first_line()
+                resume_index = len((f'{first_line_text} ').encode())
+            else:
+                first_line_text, next_word = '', first_line_text
+        # Candidate word parts, each ending on a soft hyphen, longest first
+        soft_hyphen_indexes = [
+            match.start() for match in re.finditer(soft_hyphen, next_word)]
+        soft_hyphen_indexes.reverse()
+        dictionary_iterations = [next_word[:i+1] for i in soft_hyphen_indexes]
+        start_word = 0
+    elif auto_hyphenation:
+        # Pyphen dictionaries are cached on the context
+        dictionary_key = (lang, left, right, total)
+        dictionary = context.dictionaries.get(dictionary_key)
+        if dictionary is None:
+            dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
+            context.dictionaries[dictionary_key] = dictionary
+        dictionary_iterations = [
+            start for start, end in dictionary.iterate(next_word)]
+    else:
+        dictionary_iterations = []
+
+    if dictionary_iterations:
+        # Try each candidate in order and keep the first one giving a
+        # single line that fits (or the last candidate, even if too long)
+        for first_word_part in dictionary_iterations:
+            new_first_line_text = (
+                first_line_text +
+                second_line_text[:start_word] +
+                first_word_part)
+            hyphenated_first_line_text = (
+                new_first_line_text + style['hyphenate_character'])
+            new_layout = create_layout(
+                hyphenated_first_line_text, style, context, max_width,
+                justification_spacing)
+            new_first_line, index = new_layout.get_first_line()
+            new_first_line_width, _ = line_size(new_first_line, style)
+            new_space = max_width - new_first_line_width
+            hyphenated = index is None and (
+                new_space >= 0 or first_word_part == dictionary_iterations[-1])
+            if hyphenated:
+                layout = new_layout
+                first_line = new_first_line
+                resume_index = len(new_first_line_text.encode())
+                break
+
+        if not hyphenated and not first_line_text:
+            # Recreate the layout with no max_width to be sure that
+            # we don't break before or inside the hyphenate character
+            hyphenated = True
+            layout.set_text(hyphenated_first_line_text)
+            pango.pango_layout_set_width(layout.layout, -1)
+            first_line, _ = layout.get_first_line()
+            resume_index = len(new_first_line_text.encode())
+            if text[len(first_line_text)] == soft_hyphen:
+                resume_index += len(soft_hyphen.encode())
+
+    if not hyphenated and first_line_text.endswith(soft_hyphen):
+        # Recreate the layout with no max_width to be sure that
+        # we don't break inside the hyphenate-character string
+        hyphenated = True
+        hyphenated_first_line_text = (
+            first_line_text + style['hyphenate_character'])
+        layout.set_text(hyphenated_first_line_text)
+        pango.pango_layout_set_width(layout.layout, -1)
+        first_line, _ = layout.get_first_line()
+        resume_index = len(first_line_text.encode())
+
+    # Step 5: Try to break word if it's too long for the line
+    overflow_wrap = style['overflow_wrap']
+    first_line_width, _ = line_size(first_line, style)
+    space = max_width - first_line_width
+    # If we can break words and the first line is too long
+    can_break = (
+        style['word_break'] == 'break-all' or (
+            is_line_start and (
+                overflow_wrap == 'anywhere' or
+                (overflow_wrap == 'break-word' and not minimum))))
+    if space < 0 and can_break:
+        # Is it really OK to remove hyphenation for word-break ?
+        hyphenated = False
+        # TODO: Modify code to preserve W3C condition:
+        # "Shaping characters are still shaped as if the word were not broken"
+        # The way new lines are processed in this function (one by one with no
+        # memory of the last) prevents shaping characters (arabic, for
+        # instance) from keeping their shape when wrapped on the next line with
+        # pango layout. Maybe insert Unicode shaping characters in text?
+        layout.set_text(text)
+        pango.pango_layout_set_width(
+            layout.layout, units_from_double(max_width))
+        pango.pango_layout_set_wrap(
+            layout.layout, PANGO_WRAP_MODE['WRAP_CHAR'])
+        first_line, index = layout.get_first_line()
+        resume_index = index or first_line.length
+        if resume_index >= len(text.encode()):
+            resume_index = None
+
+    return first_line_metrics(
+        first_line, text, layout, resume_index, space_collapse, style,
+        hyphenated, style['hyphenate_character'])
+
+
+def get_log_attrs(text, lang):
+    """Return the UTF-8 bytes of ``text`` and its Pango logical attributes.
+
+    ``lang`` is a language tag used to select the Pango language; the
+    default Pango language is used when it is empty, or when its encoded
+    form is empty.
+
+    Return ``(bytestring, log_attrs)`` where ``log_attrs`` is a
+    ``PangoLogAttr`` array with one item per character of ``text``, plus
+    one.
+
+    """
+    lang_bytes = None
+    if lang:
+        # lang_p has to stay referenced while Pango reads the string
+        lang_p, lang_bytes = unicode_to_char_p(lang)
+    if lang_bytes:
+        language = pango.pango_language_from_string(lang_p)
+    else:
+        # Always define the language, even if the encoded tag is empty
+        language = pango.pango_language_get_default()
+    # TODO: this should be removed when bidi is supported
+    for char in ('\u202a', '\u202b', '\u202c', '\u202d', '\u202e'):
+        text = text.replace(char, '\u200b')
+    text_p, bytestring = unicode_to_char_p(text)
+    # Pango needs one attribute per character, plus one for the end position
+    length = len(text) + 1
+    log_attrs = ffi.new('PangoLogAttr[]', length)
+    pango.pango_get_log_attrs(
+        text_p, len(bytestring), -1, language, log_attrs, length)
+    return bytestring, log_attrs
+
+
+def can_break_text(text, lang):
+    """Tell whether a line break is allowed inside ``text``.
+
+    Return ``None`` when ``text`` has less than two characters. Otherwise,
+    return whether Pango allows a line break at one of the positions between
+    the first and the last characters.
+
+    """
+    if not text or len(text) < 2:
+        return None
+    _, attrs = get_log_attrs(text, lang)
+    # Positions 0 and len(text) are the edges of the text, not inner breaks
+    for position in range(1, len(text)):
+        if attrs[position].is_line_break:
+            return True
+    return False
+
+
+def get_next_word_boundaries(text, lang):
+    """Return the ``(start, end)`` positions of the first word of ``text``.
+
+    ``end`` is the first position flagged as a word end by Pango, ``start``
+    is the last word boundary found before it. Return ``None`` when ``text``
+    has less than two characters or when no word end is found.
+
+    """
+    if not text or len(text) < 2:
+        return None
+    _, attrs = get_log_attrs(text, lang)
+    for position, attr in enumerate(attrs):
+        # The word end is checked first, it is also a word boundary
+        if attr.is_word_end:
+            return word_start, position
+        if attr.is_word_boundary:
+            word_start = position
+    return None
+
+
+def get_last_word_end(text, lang):
+    """Return the position of the last word end found inside ``text``.
+
+    The position at the very end of the text is ignored. Return ``None``
+    when ``text`` has less than two characters or when no word end is found.
+
+    """
+    if not text or len(text) < 2:
+        return None
+    _, attrs = get_log_attrs(text, lang)
+    # Walk backwards, starting just before the end-of-text position
+    for position in range(len(text) - 1, -1, -1):
+        if attrs[position].is_word_end:
+            return position
+    return None