feat: add comprehensive GitHub workflow and development tools

This commit is contained in:
Stiftung Development
2025-09-06 18:31:54 +02:00
commit ab23d7187e
10224 changed files with 2075210 additions and 0 deletions

View File

@@ -0,0 +1,302 @@
"""PDF generation management."""
import pydyf
from .. import VERSION, Attachment
from ..html import W3C_DATE_RE
from ..logger import LOGGER, PROGRESS_LOGGER
from ..matrix import Matrix
from . import pdfa, pdfua
from .fonts import build_fonts_dictionary
from .stream import Stream
from .anchors import ( # isort:skip
add_annotations, add_inputs, add_links, add_outlines, resolve_links,
write_pdf_attachment)
# Merge the variant tables of the pdfa and pdfua modules into one mapping
# keyed by variant name. generate_pdf() unpacks each value as a
# (variant_function, properties) pair.
VARIANTS = {
    name: data for variants in (pdfa.VARIANTS, pdfua.VARIANTS)
    for (name, data) in variants.items()}
def _w3c_date_to_pdf(string, attr_name):
"""Tranform W3C date to PDF format."""
if string is None:
return None
match = W3C_DATE_RE.match(string)
if match is None:
LOGGER.warning(f'Invalid {attr_name} date: {string!r}')
return None
groups = match.groupdict()
pdf_date = ''
found = groups['hour']
for key in ('second', 'minute', 'hour', 'day', 'month', 'year'):
if groups[key]:
found = True
pdf_date = groups[key] + pdf_date
elif found:
pdf_date = f'{(key in ("day", "month")):02d}{pdf_date}'
if groups['hour']:
assert groups['minute']
if groups['tz_hour']:
assert groups['tz_hour'].startswith(('+', '-'))
assert groups['tz_minute']
tz_hour = int(groups['tz_hour'])
tz_minute = int(groups['tz_minute'])
pdf_date += f"{tz_hour:+03d}'{tz_minute:02d}"
else:
pdf_date += 'Z'
return f'D:{pdf_date}'
def _reference_resources(pdf, resources, images, fonts):
    """Add a nested resources dictionary to the PDF and return its reference.

    When ``resources`` has a ``Font`` entry, it must still be ``None``; it is
    replaced by ``fonts``. The objects stored in ``resources`` are then
    replaced by references before ``resources`` itself is added to ``pdf``.

    """
    has_font_entry = 'Font' in resources
    if has_font_entry:
        # The entry is only a placeholder at this point.
        assert resources['Font'] is None
        resources['Font'] = fonts

    _use_references(pdf, resources, images)
    pdf.add_object(resources)
    return resources.reference
def _use_references(pdf, resources, images):
# XObjects
for key, x_object in resources.get('XObject', {}).items():
# Images
if x_object is None:
image_data = images[key]
x_object = image_data['x_object']
if x_object is not None:
# Image already added to PDF
resources['XObject'][key] = x_object.reference
continue
image = image_data['image']
dpi_ratio = max(image_data['dpi_ratios'])
x_object = image.get_x_object(image_data['interpolate'], dpi_ratio)
image_data['x_object'] = x_object
pdf.add_object(x_object)
resources['XObject'][key] = x_object.reference
# Masks
if 'SMask' in x_object.extra:
pdf.add_object(x_object.extra['SMask'])
x_object.extra['SMask'] = x_object.extra['SMask'].reference
# Resources
if 'Resources' in x_object.extra:
x_object.extra['Resources'] = _reference_resources(
pdf, x_object.extra['Resources'], images, resources['Font'])
# Patterns
for key, pattern in resources.get('Pattern', {}).items():
pdf.add_object(pattern)
resources['Pattern'][key] = pattern.reference
if 'Resources' in pattern.extra:
pattern.extra['Resources'] = _reference_resources(
pdf, pattern.extra['Resources'], images, resources['Font'])
# Shadings
for key, shading in resources.get('Shading', {}).items():
pdf.add_object(shading)
resources['Shading'][key] = shading.reference
# Alpha states
for key, alpha in resources.get('ExtGState', {}).items():
if 'SMask' in alpha and 'G' in alpha['SMask']:
alpha['SMask']['G'] = alpha['SMask']['G'].reference
def generate_pdf(document, target, zoom, **options):
    """Build the ``pydyf.PDF`` object representing ``document``.

    :param document:
        Rendered document. This function reads its ``pages``, ``fonts``,
        ``font_config``, ``metadata`` and ``url_fetcher`` attributes and
        calls its ``make_bookmark_tree()`` method.
    :param target:
        Not used by this function.
    :param zoom:
        Zoom factor, multiplied by 0.75 to get the scale from CSS pixels to
        PDF points.
    :param options:
        Rendering options. The keys read here are ``pdf_variant``,
        ``uncompressed_pdf``, ``custom_metadata``, ``attachments`` and
        ``full_fonts``; the whole mapping is also given to
        ``build_fonts_dictionary()``.
    :returns:
        The populated ``pydyf.PDF`` object.

    """
    # 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
    scale = zoom * 0.75

    PROGRESS_LOGGER.info('Step 6 - Creating PDF')

    # Set properties according to PDF variants
    mark = False
    variant = options['pdf_variant']
    if variant:
        variant_function, properties = VARIANTS[variant]
        if 'mark' in properties:
            mark = properties['mark']

    pdf = pydyf.PDF()
    # These dictionaries are shared by all page streams and by the single
    # resources dictionary referenced by every page.
    states = pydyf.Dictionary()
    x_objects = pydyf.Dictionary()
    patterns = pydyf.Dictionary()
    shadings = pydyf.Dictionary()
    images = {}
    resources = pydyf.Dictionary({
        'ExtGState': states,
        'XObject': x_objects,
        'Pattern': patterns,
        'Shading': shadings,
    })
    pdf.add_object(resources)
    pdf_names = []

    # Links and anchors
    page_links_and_anchors = list(resolve_links(document.pages))

    # Cache of attachments already embedded for annotations, keyed by target.
    annot_files = {}
    pdf_pages, page_streams = [], []
    compress = not options['uncompressed_pdf']
    for page_number, (page, links_and_anchors) in enumerate(
            zip(document.pages, page_links_and_anchors)):
        # Draw from the top-left corner
        matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)

        # Page box including the bleed area, scaled to PDF units.
        page_width = scale * (
            page.width + page.bleed['left'] + page.bleed['right'])
        page_height = scale * (
            page.height + page.bleed['top'] + page.bleed['bottom'])
        left = -scale * page.bleed['left']
        top = -scale * page.bleed['top']
        right = left + page_width
        bottom = top + page_height

        # Same box divided by the scale, given as (x, y, width, height).
        page_rectangle = (
            left / scale, top / scale,
            (right - left) / scale, (bottom - top) / scale)
        stream = Stream(
            document.fonts, page_rectangle, states, x_objects, patterns,
            shadings, images, mark, compress=compress)
        stream.transform(d=-1, f=(page.height * scale))
        pdf.add_object(stream)
        page_streams.append(stream)

        pdf_page = pydyf.Dictionary({
            'Type': '/Page',
            'Parent': pdf.pages.reference,
            'MediaBox': pydyf.Array([left, top, right, bottom]),
            'Contents': stream.reference,
            'Resources': resources.reference,
        })
        if mark:
            pdf_page['Tabs'] = '/S'
            pdf_page['StructParents'] = page_number
        pdf.add_page(pdf_page)
        pdf_pages.append(pdf_page)

        add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark)
        # Only the links (first item of the pair) are needed for annotations.
        add_annotations(
            links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files,
            compress)
        add_inputs(
            page.inputs, matrix, pdf, pdf_page, resources, stream,
            document.font_config.font_map, compress)
        page.paint(stream, scale)

        # Bleed
        # NOTE(review): the bleed is multiplied by the constant 0.75 here
        # while left/top/right/bottom use scale (zoom * 0.75); both only
        # agree when zoom is 1 - confirm that this is intended.
        bleed = {key: value * 0.75 for key, value in page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']

        # Arbitrarily set PDF BleedBox between CSS bleed box (MediaBox) and
        # CSS page box (TrimBox) at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])

        pdf_page['TrimBox'] = pydyf.Array([
            trim_left, trim_top, trim_right, trim_bottom])
        pdf_page['BleedBox'] = pydyf.Array([
            bleed_left, bleed_top, bleed_right, bleed_bottom])

    # Outlines
    add_outlines(pdf, document.make_bookmark_tree(scale, transform_pages=True))

    PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')

    # PDF information
    pdf.info['Producer'] = pydyf.String(f'WeasyPrint {VERSION}')
    metadata = document.metadata
    if metadata.title:
        pdf.info['Title'] = pydyf.String(metadata.title)
    if metadata.authors:
        pdf.info['Author'] = pydyf.String(', '.join(metadata.authors))
    if metadata.description:
        pdf.info['Subject'] = pydyf.String(metadata.description)
    if metadata.keywords:
        pdf.info['Keywords'] = pydyf.String(', '.join(metadata.keywords))
    if metadata.generator:
        pdf.info['Creator'] = pydyf.String(metadata.generator)
    # NOTE(review): _w3c_date_to_pdf() returns None for a date that doesn't
    # match W3C_DATE_RE; confirm that pydyf.String accepts None or that
    # metadata dates are always validated before reaching this point.
    if metadata.created:
        pdf.info['CreationDate'] = pydyf.String(
            _w3c_date_to_pdf(metadata.created, 'created'))
    if metadata.modified:
        pdf.info['ModDate'] = pydyf.String(
            _w3c_date_to_pdf(metadata.modified, 'modified'))
    if metadata.lang:
        pdf.catalog['Lang'] = pydyf.String(metadata.lang)
    if options['custom_metadata']:
        for key, value in metadata.custom.items():
            # Only keep ASCII alphanumeric characters in keys, and skip
            # entries whose key ends up empty.
            key = ''.join(char for char in key if char.isalnum())
            key = key.encode('ascii', errors='ignore').decode()
            if key:
                pdf.info[key] = pydyf.String(value)

    # Embedded files
    attachments = metadata.attachments.copy()
    if options['attachments']:
        for attachment in options['attachments']:
            # Wrap anything that isn't already an Attachment.
            if not isinstance(attachment, Attachment):
                attachment = Attachment(
                    attachment, url_fetcher=document.url_fetcher)
            attachments.append(attachment)
    pdf_attachments = []
    for attachment in attachments:
        # write_pdf_attachment() returns None when the attachment can't be
        # loaded; such attachments are skipped.
        pdf_attachment = write_pdf_attachment(pdf, attachment, compress)
        if pdf_attachment is not None:
            pdf_attachments.append(pdf_attachment)
    if pdf_attachments:
        # Flat array of alternating names and file specification references.
        content = pydyf.Dictionary({'Names': pydyf.Array()})
        for i, pdf_attachment in enumerate(pdf_attachments):
            content['Names'].append(pydyf.String(f'attachment{i}'))
            content['Names'].append(pdf_attachment.reference)
        pdf.add_object(content)
        if 'Names' not in pdf.catalog:
            pdf.catalog['Names'] = pydyf.Dictionary()
        pdf.catalog['Names']['EmbeddedFiles'] = content.reference

    # Embedded fonts
    subset = not options['full_fonts']
    pdf_fonts = build_fonts_dictionary(
        pdf, document.fonts, compress, subset, options)
    pdf.add_object(pdf_fonts)
    if 'AcroForm' in pdf.catalog:
        # Include Dingbats for forms
        dingbats = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': '/Type1',
            'BaseFont': '/ZapfDingbats',
        })
        pdf.add_object(dingbats)
        pdf_fonts['ZaDb'] = dingbats.reference
    resources['Font'] = pdf_fonts.reference
    # Fonts must be set before this call: nested resources read
    # resources['Font'] in _use_references().
    _use_references(pdf, resources, images)

    # Anchors
    if pdf_names:
        # Anchors are name trees that have to be sorted
        name_array = pydyf.Array()
        for anchor in sorted(pdf_names):
            name_array.append(pydyf.String(anchor[0]))
            name_array.append(anchor[1])
        dests = pydyf.Dictionary({'Names': name_array})
        if 'Names' not in pdf.catalog:
            pdf.catalog['Names'] = pydyf.Dictionary()
        pdf.catalog['Names']['Dests'] = dests

    # Apply PDF variants functions
    if variant:
        variant_function(
            pdf, metadata, document, page_streams, attachments, compress)

    return pdf

View File

@@ -0,0 +1,384 @@
"""Insert anchors, links, bookmarks and inputs in PDFs."""
import io
import mimetypes
from hashlib import md5
from os.path import basename
from urllib.parse import unquote, urlsplit
import pydyf
from .. import Attachment
from ..logger import LOGGER
from ..text.ffi import ffi, gobject, pango
from ..text.fonts import get_font_description
from ..urls import URLFetchingError
def add_links(links_and_anchors, matrix, pdf, page, names, mark):
    """Add hyperlink annotations and named anchors for a PDF page.

    ``links_and_anchors`` is a ``(links, anchors)`` pair. Internal and
    external links get a link annotation that is stored on their box, added
    to ``pdf`` and referenced in the ``Annots`` array of ``page``. Each
    anchor is appended to ``names`` with its destination on ``page``.

    """
    links, anchors = links_and_anchors

    for link_type, link_target, rectangle, box in links:
        x1, y1 = matrix.transform_point(*rectangle[:2])
        x2, y2 = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            continue

        annotation = box.link_annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([x1, y1, x2, y2]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if mark:
            annotation['Contents'] = pydyf.String(link_target)
        if link_type == 'internal':
            annotation['Dest'] = pydyf.String(link_target)
        else:
            annotation['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(link_target),
            })
        pdf.add_object(annotation)

        if 'Annots' not in page:
            page['Annots'] = pydyf.Array()
        page['Annots'].append(annotation.reference)

    for anchor_name, x, y in anchors:
        pdf_x, pdf_y = matrix.transform_point(x, y)
        destination = pydyf.Array([page.reference, '/XYZ', pdf_x, pdf_y, 0])
        names.append([anchor_name, destination])
def add_outlines(pdf, bookmarks, parent=None):
    """Add the bookmark tree to the PDF as outlines.

    Each bookmark is a ``(title, (page, x, y), children, state)`` tuple and
    gets its own outline object; children are handled recursively. At the
    top level (``parent`` is ``None``), a root outlines dictionary is added
    and set in the catalog when there is at least one outline.

    :returns:
        A ``(outlines, count)`` tuple, with the outlines created at this
        level and the number of bookmarks at this level plus the counts of
        the children that aren't closed.

    """
    total = len(bookmarks)
    siblings = []

    for title, (page, x, y), children, state in bookmarks:
        destination = pydyf.Array((pdf.page_references[page], '/XYZ', x, y, 0))
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title), 'Dest': destination})
        pdf.add_object(outline)

        sub_outlines, sub_count = add_outlines(pdf, children, parent=outline)
        if state == 'closed':
            # Closed outlines store a negated count that isn't added to the
            # count of this level.
            outline['Count'] = sub_count * -1
        else:
            outline['Count'] = sub_count
            total += sub_count

        if siblings:
            previous = siblings[-1]
            outline['Prev'] = previous.reference
            previous['Next'] = outline.reference
        if sub_outlines:
            outline['First'] = sub_outlines[0].reference
            outline['Last'] = sub_outlines[-1].reference
        if parent is not None:
            outline['Parent'] = parent.reference
        siblings.append(outline)

    if parent is None and siblings:
        root = pydyf.Dictionary({
            'Count': total,
            'First': siblings[0].reference,
            'Last': siblings[-1].reference,
        })
        pdf.add_object(root)
        for outline in siblings:
            outline['Parent'] = root.reference
        pdf.catalog['Outlines'] = root.reference

    return siblings, total
def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map,
               compress):
    """Include form inputs in PDF.

    For each ``(element, style, rectangle)`` item of ``inputs``, add a widget
    annotation to ``pdf``, and reference it in both the ``Annots`` array of
    ``page`` and the ``Fields`` array of the catalog's ``AcroForm``
    dictionary, which is created when missing. Nothing is done when
    ``inputs`` is empty.

    :param matrix:
        Matrix whose ``transform_point()`` converts rectangle corners.
    :param resources:
        Resources dictionary, referenced as the form's default resources
        and as the resources of checkbox appearance streams.
    :param stream:
        Page stream, used to register fonts with ``add_font()``.
    :param font_map:
        Pango font map used to create a context and load fonts.
    :param compress:
        Whether the streams created here are compressed.

    """
    if not inputs:
        return

    if 'Annots' not in page:
        page['Annots'] = pydyf.Array()
    if 'AcroForm' not in pdf.catalog:
        pdf.catalog['AcroForm'] = pydyf.Dictionary({
            'Fields': pydyf.Array(),
            'DR': resources.reference,
            'NeedAppearances': 'true',
        })
    # First token of the page's content reference, used below to build
    # default field names. It is decoded below, so it is expected to be
    # bytes.
    page_reference = page['Contents'].split()[0]

    # The context is unreferenced with g_object_unref when garbage collected.
    context = ffi.gc(
        pango.pango_font_map_create_context(font_map),
        gobject.g_object_unref)
    for i, (element, style, rectangle) in enumerate(inputs):
        rectangle = (
            *matrix.transform_point(*rectangle[:2]),
            *matrix.transform_point(*rectangle[2:]))

        input_type = element.attrib.get('type')
        default_name = f'unknown-{page_reference.decode()}-{i}'
        input_name = element.attrib.get('name', default_name)
        # TODO: where does this 0.75 scale come from?
        font_size = style['font_size'] * 0.75
        # This stream is never added to the PDF: its operators are joined
        # to build the field's default appearance string ('DA').
        field_stream = pydyf.Stream(compress=compress)
        field_stream.set_color_rgb(*style['color'][:3])
        if input_type == 'checkbox':
            # Checkboxes
            width = rectangle[2] - rectangle[0]
            height = rectangle[1] - rectangle[3]
            # Appearance stream drawn when the checkbox is checked.
            checked_stream = pydyf.Stream(extra={
                'Resources': resources.reference,
                'Type': '/XObject',
                'Subtype': '/Form',
                'BBox': pydyf.Array((0, 0, width, height)),
            }, compress=compress)
            checked_stream.push_state()
            checked_stream.begin_text()
            checked_stream.set_color_rgb(*style['color'][:3])
            checked_stream.set_font_size('ZaDb', font_size)
            # Center (lets assume that Dingbats check has a 0.8em size)
            x = (width - font_size * 0.8) / 2
            y = (height - font_size * 0.8) / 2
            checked_stream.move_text_to(x, y)
            checked_stream.show_text_string('4')
            checked_stream.end_text()
            checked_stream.pop_state()
            pdf.add_object(checked_stream)

            checked = 'checked' in element.attrib
            field_stream.set_font_size('ZaDb', font_size)
            field = pydyf.Dictionary({
                'Type': '/Annot',
                'Subtype': '/Widget',
                'Rect': pydyf.Array(rectangle),
                'FT': '/Btn',
                'F': 1 << (3 - 1),  # Print flag
                'P': page.reference,
                'T': pydyf.String(input_name),
                'V': '/Yes' if checked else '/Off',
                'AP': pydyf.Dictionary({'N': pydyf.Dictionary({
                    'Yes': checked_stream.reference,
                })}),
                'AS': '/Yes' if checked else '/Off',
                'DA': pydyf.String(b' '.join(field_stream.stream)),
            })
        elif element.tag == 'select':
            # Select fields
            font_description = get_font_description(style)
            font = pango.pango_font_map_load_font(
                font_map, context, font_description)
            font = stream.add_font(font)
            # Fonts flagged this way are not subsetted by
            # build_fonts_dictionary().
            font.used_in_forms = True

            field_stream.set_font_size(font.hash, font_size)
            options = []
            selected_values = []
            for option in element:
                value = pydyf.String(option.attrib.get('value', ''))
                text = pydyf.String(option.text or "")
                options.append(pydyf.Array([value, text]))
                if 'selected' in option.attrib:
                    selected_values.append(value)

            field = pydyf.Dictionary({
                'DA': pydyf.String(b' '.join(field_stream.stream)),
                'F': 1 << (3 - 1),  # Print flag
                'FT': '/Ch',
                'Opt': pydyf.Array(options),
                'P': page.reference,
                'Rect': pydyf.Array(rectangle),
                'Subtype': '/Widget',
                'T': pydyf.String(input_name),
                'Type': '/Annot',
            })
            if 'multiple' in element.attrib:
                # Field flag bit 22, with all selected values.
                field['Ff'] = 1 << (22 - 1)
                field['V'] = pydyf.Array(selected_values)
            else:
                # Field flag bit 18, with the last selected value or an
                # empty string when nothing is selected.
                field['Ff'] = 1 << (18 - 1)
                field['V'] = (
                    selected_values[-1] if selected_values
                    else pydyf.String(''))
        else:
            # Text, password, textarea, files, and unknown
            font_description = get_font_description(style)
            font = pango.pango_font_map_load_font(
                font_map, context, font_description)
            font = stream.add_font(font)
            font.used_in_forms = True

            field_stream.set_font_size(font.hash, font_size)
            value = (
                element.text if element.tag == 'textarea'
                else element.attrib.get('value', ''))
            field = pydyf.Dictionary({
                'Type': '/Annot',
                'Subtype': '/Widget',
                'Rect': pydyf.Array(rectangle),
                'FT': '/Tx',
                'F': 1 << (3 - 1),  # Print flag
                'P': page.reference,
                'T': pydyf.String(input_name),
                'V': pydyf.String(value or ''),
                'DA': pydyf.String(b' '.join(field_stream.stream)),
            })
            # Field flag bits: 13 for textareas, 14 for passwords and 21 for
            # files.
            if element.tag == 'textarea':
                field['Ff'] = 1 << (13 - 1)
            elif input_type == 'password':
                field['Ff'] = 1 << (14 - 1)
            elif input_type == 'file':
                field['Ff'] = 1 << (21 - 1)

            # NOTE(review): assumed to belong to the text field branch only;
            # confirm against the original indentation. The value is stored
            # as the attribute's string, only when it is made of digits.
            maxlength = element.get('maxlength')
            if maxlength and maxlength.isdigit():
                field['MaxLen'] = element.get('maxlength')

        pdf.add_object(field)
        page['Annots'].append(field.reference)
        pdf.catalog['AcroForm']['Fields'].append(field.reference)
def add_annotations(links, matrix, document, pdf, page, annot_files, compress):
    """Add file attachment annotations to a PDF page.

    Only links whose type is ``'attachment'`` are handled. Embedded files
    are cached in ``annot_files``, keyed by link target, so that each target
    is embedded at most once; targets that couldn't be embedded are cached
    as ``None`` and skipped.

    """
    # TODO: splitting a link into multiple independent rectangular
    # annotations works well for pure links, but rather mediocre for
    # other annotations and fails completely for transformed (CSS) or
    # complex link shapes (area). It would be better to use /AP for all
    # links and coalesce link shapes that originate from the same HTML
    # link. This would give a feeling similar to what browsers do with
    # links that span multiple lines.
    for kind, target, area, _ in links:
        if kind != 'attachment':
            continue

        if target in annot_files:
            file_spec = annot_files[target]
        else:
            # A single link can be split in multiple regions. We don't want
            # to embed a file multiple times of course, so keep a reference
            # to every embedded URL and reuse the object number.
            # TODO: Use the title attribute as description. The comment
            # above about multiple regions won't always be correct, because
            # two links might have the same href, but different titles.
            attachment = Attachment(
                url=target, url_fetcher=document.url_fetcher)
            file_spec = annot_files[target] = write_pdf_attachment(
                pdf, attachment, compress)
        if file_spec is None:
            continue

        x1, y1 = matrix.transform_point(*area[:2])
        x2, y2 = matrix.transform_point(*area[2:])
        bounds = (x1, y1, x2, y2)

        # Empty form used as the appearance of the annotation.
        appearance = pydyf.Stream([], {
            'Type': '/XObject',
            'Subtype': '/Form',
            'BBox': pydyf.Array(bounds),
        }, compress)
        pdf.add_object(appearance)

        annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Rect': pydyf.Array(bounds),
            'Subtype': '/FileAttachment',
            'T': pydyf.String(),
            'FS': file_spec.reference,
            'AP': pydyf.Dictionary({'N': appearance.reference}),
            'AS': '/N',
        })
        pdf.add_object(annotation)

        if 'Annots' not in page:
            page['Annots'] = pydyf.Array()
        page['Annots'].append(annotation.reference)
def write_pdf_attachment(pdf, attachment, compress):
    """Write an attachment to the PDF stream.

    The content of ``attachment.source`` is read, stored in an embedded file
    stream added to ``pdf``, and described by a file specification
    dictionary that is added to ``pdf`` too. The MD5 checksum of the content
    is stored in ``attachment.md5``.

    :param pdf:
        PDF object the stream and the file specification are added to.
    :param attachment:
        Attachment whose ``source``, ``created``, ``modified`` and
        ``description`` attributes are read.
    :param compress:
        Whether the embedded file stream is compressed.
    :returns:
        The file specification dictionary, or ``None`` when fetching the
        attachment raised ``URLFetchingError`` (an error is logged).

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = None
    chunks = []
    try:
        with attachment.source as (_, source, url, _):
            if isinstance(source, str):
                source = source.encode()
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            # Collect chunks and join them once: extending an immutable
            # bytes object on each read copies everything read so far,
            # which is quadratic in the size of the attachment.
            chunks.extend(iter(lambda: source.read(4096), b''))
    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return None
    stream = b''.join(chunks)
    uncompressed_length = len(stream)
    attachment.md5 = md5(stream, usedforsecurity=False).hexdigest()

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename and MIME type.
    if url and urlsplit(url).path:
        filename = basename(unquote(urlsplit(url).path))
    else:
        filename = 'attachment.bin'
    mime_type = mimetypes.guess_type(filename, strict=False)[0]
    if not mime_type:
        mime_type = 'application/octet-stream'

    creation = pydyf.String(attachment.created.strftime('D:%Y%m%d%H%M%SZ'))
    mod = pydyf.String(attachment.modified.strftime('D:%Y%m%d%H%M%SZ'))
    file_extra = pydyf.Dictionary({
        'Type': '/EmbeddedFile',
        # The slash of the MIME type is escaped as #2f in the PDF name.
        'Subtype': f'/{mime_type.replace("/", "#2f")}',
        'Params': pydyf.Dictionary({
            'CheckSum': f'<{attachment.md5}>',
            'Size': uncompressed_length,
            'CreationDate': creation,
            'ModDate': mod,
        })
    })
    file_stream = pydyf.Stream([stream], file_extra, compress=compress)
    pdf.add_object(file_stream)

    pdf_attachment = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(pdf_attachment)
    return pdf_attachment
def resolve_links(pages):
    """Filter page links and collect page anchors.

    Internal links to a missing anchor are removed and an error is logged.

    If multiple anchors have the same name, the first one is used.

    :returns:
        A generator yielding one ``(links, anchors)`` tuple per page.
        ``links`` is the list of the page's links, without the internal
        links whose anchor doesn't exist. ``anchors`` is a list of
        ``(anchor_name, x, y)`` tuples for the anchors that are first
        defined on this page.

    """
    known_anchors = set()
    anchors_by_page = []
    for page in pages:
        page_anchors = []
        for anchor_name, (point_x, point_y) in page.anchors.items():
            if anchor_name in known_anchors:
                # Only the first anchor with a given name is kept.
                continue
            page_anchors.append((anchor_name, point_x, point_y))
            known_anchors.add(anchor_name)
        anchors_by_page.append(page_anchors)

    for page, page_anchors in zip(pages, anchors_by_page):
        kept_links = []
        for link in page.links:
            link_type, anchor_name, _, _ = link
            if link_type != 'internal' or anchor_name in known_anchors:
                # Non-internal links and internal links with a known anchor
                kept_links.append(link)
            else:
                LOGGER.error(
                    'No anchor #%s for internal URI reference',
                    anchor_name)
        yield kept_links, page_anchors

View File

@@ -0,0 +1,318 @@
"""Fonts integration in PDF."""
from math import ceil
import pydyf
from ..logger import LOGGER
def build_fonts_dictionary(pdf, fonts, compress_pdf, subset, options):
    """Add fonts to the PDF and return the dictionary referencing them.

    :param pdf:
        PDF object the font objects are added to.
    :param fonts:
        Mapping whose values are the fonts to include.
    :param compress_pdf:
        Whether the streams created here are compressed.
    :param subset:
        Whether only used glyphs are kept, except for fonts used in forms.
    :param options:
        Rendering options; the ``hinting`` and ``pdf_version`` keys are
        read.
    :returns:
        A ``pydyf.Dictionary`` mapping each font hash to the reference of
        its font dictionary. The returned dictionary itself is not added to
        ``pdf`` by this function.

    """
    pdf_fonts = pydyf.Dictionary()
    # Group fonts by hash, so that each font file is embedded once.
    fonts_by_file_hash = {}
    for font in fonts.values():
        fonts_by_file_hash.setdefault(font.hash, []).append(font)
    font_references_by_file_hash = {}
    for file_hash, file_fonts in fonts_by_file_hash.items():
        # TODO: find why we can have multiple fonts for one font file
        font = file_fonts[0]
        # Bitmap fonts have no embedded font file, see
        # _build_bitmap_font_dictionary().
        if font.bitmap:
            continue

        # Clean font, optimize and handle emojis
        cmap = {}
        if subset and not font.used_in_forms:
            # Merge the maps of all the fonts sharing this file.
            for file_font in file_fonts:
                cmap = {**cmap, **file_font.cmap}
        font.clean(cmap, options['hinting'])

        # Include font
        if font.type == 'otf':
            font_extra = pydyf.Dictionary({'Subtype': '/OpenType'})
        else:
            font_extra = pydyf.Dictionary({'Length1': len(font.file_content)})
        font_stream = pydyf.Stream(
            [font.file_content], font_extra, compress=compress_pdf)
        pdf.add_object(font_stream)
        font_references_by_file_hash[file_hash] = font_stream.reference

    for font in fonts.values():
        if not font.ttfont or (subset and not font.used_in_forms):
            # Only store widths and map for used glyphs
            font_widths = font.widths
            cmap = font.cmap
        else:
            # Store width and Unicode map for all glyphs
            font_widths, cmap = {}, {}
            for letter, key in font.ttfont.getBestCmap().items():
                glyph = font.ttfont.getGlyphID(key)
                # Keep the first character found for each glyph.
                if glyph not in cmap:
                    cmap[glyph] = chr(letter)
                width = font.ttfont.getGlyphSet()[key].width
                # Widths are stored in thousandths of the font's upem.
                font_widths[glyph] = width * 1000 / font.upem

        max_x = max(font_widths.values()) if font_widths else 0
        bbox = (0, font.descent, max_x, font.ascent)

        # Build an array alternating the first glyph ID of each run of
        # consecutive IDs and the array of the widths of this run.
        widths = pydyf.Array()
        for i in sorted(font_widths):
            if i - 1 not in font_widths:
                widths.append(i)
                current_widths = pydyf.Array()
                widths.append(current_widths)
            current_widths.append(font_widths[i])

        font_file = f'FontFile{3 if font.type == "otf" else 2}'
        # Map from glyph IDs to text, as UTF-16-BE hexadecimal strings.
        to_unicode = pydyf.Stream([
            b'/CIDInit /ProcSet findresource begin',
            b'12 dict begin',
            b'begincmap',
            b'/CIDSystemInfo',
            b'<< /Registry (Adobe)',
            b'/Ordering (UCS)',
            b'/Supplement 0',
            b'>> def',
            b'/CMapName /Adobe-Identity-UCS def',
            b'/CMapType 2 def',
            b'1 begincodespacerange',
            b'<0000> <ffff>',
            b'endcodespacerange',
            f'{len(cmap)} beginbfchar'.encode()], compress=compress_pdf)
        for glyph, text in cmap.items():
            unicode_codepoints = ''.join(
                f'{letter.encode("utf-16-be").hex()}' for letter in text)
            to_unicode.stream.append(
                f'<{glyph:04x}> <{unicode_codepoints}>'.encode())
        to_unicode.stream.extend([
            b'endbfchar',
            b'endcmap',
            b'CMapName currentdict /CMap defineresource pop',
            b'end',
            b'end'])
        pdf.add_object(to_unicode)
        font_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': f'/Type{3 if font.bitmap else 0}',
            'BaseFont': font.name,
            'ToUnicode': to_unicode.reference,
        })

        if font.bitmap:
            _build_bitmap_font_dictionary(
                font_dictionary, pdf, font, widths, compress_pdf, subset)
        else:
            flags = font.flags
            # Set the flag when there are used glyphs that all share the
            # same width.
            if len(widths) > 1 and len(set(font.widths.values())) == 1:
                flags += 2 ** (1 - 1)  # FixedPitch
            font_descriptor = pydyf.Dictionary({
                'Type': '/FontDescriptor',
                'FontName': font.name,
                'FontFamily': pydyf.String(font.family),
                'Flags': flags,
                'FontBBox': pydyf.Array(bbox),
                'ItalicAngle': font.italic_angle,
                'Ascent': font.ascent,
                'Descent': font.descent,
                'CapHeight': bbox[3],
                'StemV': font.stemv,
                'StemH': font.stemh,
                font_file: font_references_by_file_hash[font.hash],
            })
            # NOTE(review): this is a lexicographic string comparison, so a
            # version such as '1.10' would compare lower than '1.4' -
            # confirm that this can't happen.
            if str(options['pdf_version']) <= '1.4':  # Cast for bytes and None
                # Build a bit field with one bit set per used glyph ID,
                # most significant bit first, padded to whole bytes.
                # NOTE(review): cids[-1] fails when font.widths is empty -
                # confirm that fonts always have at least one width here.
                cids = sorted(font.widths)
                padded_width = int(ceil((cids[-1] + 1) / 8))
                bits = ['0'] * padded_width * 8
                for cid in cids:
                    bits[cid] = '1'
                stream = pydyf.Stream(
                    (int(''.join(bits), 2).to_bytes(padded_width, 'big'),),
                    compress=compress_pdf)
                pdf.add_object(stream)
                font_descriptor['CIDSet'] = stream.reference
            if font.type == 'otf':
                font_descriptor['Subtype'] = '/OpenType'
            pdf.add_object(font_descriptor)
            subfont_dictionary = pydyf.Dictionary({
                'Type': '/Font',
                'Subtype': f'/CIDFontType{0 if font.type == "otf" else 2}',
                'BaseFont': font.name,
                'CIDSystemInfo': pydyf.Dictionary({
                    'Registry': pydyf.String('Adobe'),
                    'Ordering': pydyf.String('Identity'),
                    'Supplement': 0,
                }),
                'CIDToGIDMap': '/Identity',
                'W': widths,
                'FontDescriptor': font_descriptor.reference,
            })
            pdf.add_object(subfont_dictionary)
            font_dictionary['Encoding'] = '/Identity-H'
            font_dictionary['DescendantFonts'] = pydyf.Array(
                [subfont_dictionary.reference])
        pdf.add_object(font_dictionary)
        pdf_fonts[font.hash] = font_dictionary.reference

    return pdf_fonts
def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
                                  compress_pdf, subset):
    """Fill ``font_dictionary`` for a bitmap font.

    Glyph bitmaps are read from the first strike of the font's ``EBDT``
    table, and each glyph in the character set gets a stream drawing it as
    an inline image. ``font_dictionary`` is modified in place and nothing is
    returned.

    :param widths:
        Array alternating start indices and arrays of widths, only used to
        build the encoding differences; it is then replaced by a local list
        of glyph advances.
    :param subset:
        Whether the character set is limited to the keys of ``font.cmap``
        instead of the 0-255 range.

    """
    # https://docs.microsoft.com/typography/opentype/spec/ebdt
    font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1])
    font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0])
    # NOTE(review): chars[0] fails when subset is set and font.cmap is
    # empty - confirm that this can't happen.
    if subset:
        chars = tuple(sorted(font.cmap))
    else:
        chars = tuple(range(256))
    first, last = chars[0], chars[-1]
    font_dictionary['FirstChar'] = first
    font_dictionary['LastChar'] = last
    # For each run of widths: the start index, followed by a name for each
    # index of the run that is in the character set.
    differences = []
    for index, index_widths in zip(widths[::2], widths[1::2]):
        differences.append(index)
        for i in range(len(index_widths)):
            if i + index in chars:
                differences.append(f'/{i + index}')
    font_dictionary['Encoding'] = pydyf.Dictionary({
        'Type': '/Encoding',
        'Differences': pydyf.Array(differences),
    })
    char_procs = pydyf.Dictionary({})
    font_glyphs = font.ttfont['EBDT'].strikeData[0]
    # From now on, widths holds one advance per code between first and last.
    widths = [0] * (last - first + 1)
    glyphs_info = {}
    for key, glyph in font_glyphs.items():
        glyph_format = glyph.getFormat()
        glyph_id = font.ttfont.getGlyphID(key)

        # Get and store glyph metrics
        if glyph_format == 5:
            # Metrics are read from the EBLC subtable whose glyph index
            # range includes this glyph.
            data = glyph.data
            subtables = font.ttfont['EBLC'].strikes[0].indexSubTables
            for subtable in subtables:
                first_index = subtable.firstGlyphIndex
                last_index = subtable.lastGlyphIndex
                if first_index <= glyph_id <= last_index:
                    height = subtable.metrics.height
                    advance = width = subtable.metrics.width
                    bearing_x = subtable.metrics.horiBearingX
                    bearing_y = subtable.metrics.horiBearingY
                    break
            else:
                # No subtable includes this glyph: skip it.
                LOGGER.warning(f'Unknown bitmap metrics for glyph: {glyph_id}')
                continue
        else:
            # Metrics are stored in the first bytes of the glyph data,
            # followed by the bitmap data.
            data_start = 5 if glyph_format in (1, 2, 8) else 8
            data = glyph.data[data_start:]
            height, width = glyph.data[0:2]
            bearing_x = int.from_bytes(glyph.data[2:3], 'big', signed=True)
            bearing_y = int.from_bytes(glyph.data[3:4], 'big', signed=True)
            advance = glyph.data[4]
        position_y = bearing_y - height

        if glyph_id in chars:
            widths[glyph_id - first] = advance
        # Number of bytes used by each row of the stored bitmap.
        stride = ceil(width / 8)
        glyph_info = glyphs_info[glyph_id] = {
            'width': width,
            'height': height,
            'x': bearing_x,
            'y': position_y,
            'stride': stride,
            'bitmap': None,
            'subglyphs': None,
        }

        # Decode bitmaps
        if 0 in (width, height) or not data:
            glyph_info['bitmap'] = b''
        elif glyph_format in (1, 6):
            # Data is used as is.
            glyph_info['bitmap'] = data
        elif glyph_format in (2, 5, 7):
            # Rows are stored without padding: split the bits into rows of
            # width bits and pad each row to a whole number of bytes.
            padding = (8 - (width % 8)) % 8
            bits = bin(int(data.hex(), 16))[2:]
            bits = bits.zfill(8 * len(data))
            bitmap_bits = ''.join(
                bits[i * width:(i + 1) * width] + padding * '0'
                for i in range(height))
            glyph_info['bitmap'] = int(bitmap_bits, 2).to_bytes(
                height * stride, 'big')
        elif glyph_format in (8, 9):
            # Glyph made of subglyphs: store their IDs and offsets, the
            # bitmap is composed in the second loop below.
            subglyphs = glyph_info['subglyphs'] = []
            i = 0 if glyph_format == 9 else 1
            number_of_components = int.from_bytes(data[i:i+2], 'big')
            for j in range(number_of_components):
                index = (i + 2) + (j * 4)
                subglyph_id = int.from_bytes(data[index:index+2], 'big')
                x = int.from_bytes(data[index+2:index+3], 'big', signed=True)
                y = int.from_bytes(data[index+3:index+4], 'big', signed=True)
                subglyphs.append({'id': subglyph_id, 'x': x, 'y': y})
        else:  # pragma: no cover
            # Use an all-zero bitmap for unsupported formats.
            LOGGER.warning(f'Unsupported bitmap glyph format: {glyph_format}')
            glyph_info['bitmap'] = bytes(height * stride)

    for glyph_id, glyph_info in glyphs_info.items():
        # Don't store glyph not in cmap
        if glyph_id not in chars:
            continue

        # Draw glyph
        stride = glyph_info['stride']
        width = glyph_info['width']
        height = glyph_info['height']
        x = glyph_info['x']
        y = glyph_info['y']
        if glyph_info['bitmap'] is None:
            # Compose the bitmap from subglyphs, using one big integer
            # holding all the rows of the glyph.
            length = height * stride
            bitmap_int = int.from_bytes(bytes(length), 'big')
            for subglyph in glyph_info['subglyphs']:
                sub_x = subglyph['x']
                sub_y = subglyph['y']
                sub_id = subglyph['id']
                if sub_id not in glyphs_info:
                    LOGGER.warning(f'Unknown subglyph: {sub_id}')
                    continue
                # From now on, subglyph is the stored glyph information.
                subglyph = glyphs_info[sub_id]
                if subglyph['bitmap'] is None:
                    # TODO: support subglyph in subglyph
                    LOGGER.warning(
                        f'Unsupported subglyph in subglyph: {sub_id}')
                    continue
                for row_y in range(subglyph['height']):
                    row_slice = slice(
                        row_y * subglyph['stride'],
                        (row_y + 1) * subglyph['stride'])
                    row = subglyph['bitmap'][row_slice]
                    row_int = int.from_bytes(row, 'big')
                    # Position of the row in the glyph's big integer.
                    shift = stride * 8 * (height - sub_y - row_y - 1)
                    # Align the row to the stride of the glyph.
                    stride_difference = stride - subglyph['stride']
                    if stride_difference > 0:
                        row_int <<= stride_difference * 8
                    elif stride_difference < 0:
                        row_int >>= -stride_difference * 8
                    # Apply the horizontal offset of the subglyph.
                    if sub_x > 0:
                        row_int >>= sub_x
                    elif sub_x < 0:
                        row_int <<= -sub_x
                    # Drop the bits that don't fit in one row.
                    row_int %= 1 << stride * 8
                    row_int <<= shift
                    bitmap_int |= row_int
            bitmap = bitmap_int.to_bytes(length, 'big')
        else:
            bitmap = glyph_info['bitmap']
        # Stream drawing the glyph as an inline image mask.
        bitmap_stream = pydyf.Stream([
            b'0 0 d0',
            f'{width} 0 0 {height} {x} {y} cm'.encode(),
            b'BI',
            b'/IM true',
            b'/W', width,
            b'/H', height,
            b'/BPC 1',
            b'/D [1 0]',
            b'ID', bitmap, b'EI'
        ], compress=compress_pdf)
        pdf.add_object(bitmap_stream)
        char_procs[glyph_id] = bitmap_stream.reference

    pdf.add_object(char_procs)
    font_dictionary['Widths'] = pydyf.Array(widths)
    font_dictionary['CharProcs'] = char_procs.reference

View File

@@ -0,0 +1,92 @@
"""PDF metadata stream generation."""
from xml.etree.ElementTree import Element, SubElement, register_namespace, tostring
import pydyf
from .. import __version__
# XML namespaces used for metadata, as a mapping from prefix to URI.
NS = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'xmp': 'http://ns.adobe.com/xap/1.0/',
    'pdf': 'http://ns.adobe.com/pdf/1.3/',
    'pdfaid': 'http://www.aiim.org/pdfa/ns/id/',
    'pdfuaid': 'http://www.aiim.org/pdfua/ns/id/',
}
# Register each prefix with ElementTree at import time, so that serialized
# elements use these prefixes for the namespaces above.
for key, value in NS.items():
    register_namespace(key, value)
def add_metadata(pdf, metadata, variant, version, conformance, compress):
    """Add PDF stream of metadata.

    Described in ISO-32000-1:2008, 14.3.2.

    The stream wraps an ``rdf:RDF`` tree holding one ``rdf:Description`` per
    piece of information: standard identification, producer, then each
    truthy attribute of ``metadata`` among ``title``, ``authors``,
    ``description``, ``keywords``, ``generator``, ``created`` and
    ``modified``.

    :param pdf:
        Document receiving the stream, referenced by its catalog as
        ``Metadata``.
    :param metadata:
        Object providing the document attributes listed above.
    :param variant:
        Standard suffix, ``pdf{variant}id`` has to be a key of ``NS``.
    :param version:
        Value stored, as a string, in the ``part`` attribute.
    :param conformance:
        Value stored in the ``conformance`` attribute, skipped when falsy.
    :param compress:
        Compression setting given to ``pydyf.Stream``.

    """
    rdf_ns, dc_ns, xmp_ns, pdf_ns = NS['rdf'], NS['dc'], NS['xmp'], NS['pdf']
    id_ns = NS[f'pdf{variant}id']
    root = Element(f'{{{rdf_ns}}}RDF')

    def describe(*path):
        """Add a Description to the tree, return its innermost new node."""
        node = SubElement(root, f'{{{rdf_ns}}}Description')
        node.attrib[f'{{{rdf_ns}}}about'] = ''
        for tag in path:
            node = SubElement(node, tag)
        return node

    # Standard identification
    identification = describe()
    identification.attrib[f'{{{id_ns}}}part'] = str(version)
    if conformance:
        identification.attrib[f'{{{id_ns}}}conformance'] = conformance

    # Producer
    producer = describe()
    producer.attrib[f'{{{pdf_ns}}}Producer'] = f'WeasyPrint {__version__}'

    # Document information
    if metadata.title:
        item = describe(
            f'{{{dc_ns}}}title', f'{{{rdf_ns}}}Alt', f'{{{rdf_ns}}}li')
        item.attrib['xml:lang'] = 'x-default'
        item.text = metadata.title
    if metadata.authors:
        sequence = describe(f'{{{dc_ns}}}creator', f'{{{rdf_ns}}}Seq')
        for author in metadata.authors:
            SubElement(sequence, f'{{{rdf_ns}}}li').text = author
    if metadata.description:
        item = describe(
            f'{{{dc_ns}}}subject', f'{{{rdf_ns}}}Bag', f'{{{rdf_ns}}}li')
        item.text = metadata.description
    if metadata.keywords:
        item = describe(f'{{{pdf_ns}}}Keywords')
        item.text = ', '.join(metadata.keywords)
    xmp_properties = (
        ('CreatorTool', metadata.generator),
        ('CreateDate', metadata.created),
        ('ModifyDate', metadata.modified),
    )
    for tag, text in xmp_properties:
        if text:
            describe(f'{{{xmp_ns}}}{tag}').text = text

    # Wrap the serialized tree between the packet header and footer
    packet = b'\n'.join((
        b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>',
        tostring(root, encoding='utf-8'),
        b'<?xpacket end="r"?>',
    ))
    metadata_stream = pydyf.Stream(
        [packet], {'Type': '/Metadata', 'Subtype': '/XML'}, compress)
    pdf.add_object(metadata_stream)
    pdf.catalog['Metadata'] = metadata_stream.reference

View File

@@ -0,0 +1,117 @@
"""PDF/A generation."""
# Provide read_binary(package, resource) whatever the Python version is:
# built on files() when it can be imported, imported as is otherwise.
try:
    # Available in Python 3.9+
    from importlib.resources import files
except ImportError:
    # Deprecated in Python 3.11+
    from importlib.resources import read_binary
else:
    def read_binary(package, resource):
        """Return the bytes of ``resource`` stored in ``package``."""
        return (files(package) / resource).read_bytes()
from functools import partial
import pydyf
from .metadata import add_metadata
def _filespecs(pdf):
    """Yield the ``/Filespec`` dictionaries found among the PDF objects."""
    for candidate in pdf.objects:
        if isinstance(candidate, dict):
            if candidate.get('Type') == '/Filespec':
                yield candidate


def _embedded_file(pdf, filespec):
    """Return the object referenced by the ``EF``/``F`` entry of a filespec."""
    number = int(filespec['EF']['F'].split()[0])
    return pdf.objects[number]


def pdfa(pdf, metadata, document, page_streams, attachments, compress,
         version, variant):
    """Set metadata for PDF/A documents.

    Add the sRGB output intent, filter or declare attachments depending on
    ``version``, flag annotations as printable and add the metadata stream.

    :param pdf:
        Document to modify in place.
    :param metadata:
        Document metadata, given to ``add_metadata``.
    :param document:
        Not used by this function.
    :param page_streams:
        Not used by this function.
    :param attachments:
        Attachments whose ``md5`` and ``relationship`` attributes are read
        to set ``AFRelationship`` (version 3 and more).
    :param compress:
        Compression setting of the added streams, metadata being never
        compressed for version 1.
    :param int version:
        PDF/A part.
    :param variant:
        PDF/A conformance level, stored in metadata.

    """
    # Add ICC profile.
    icc_data = read_binary(__package__, 'sRGB2014.icc')
    profile = pydyf.Stream(
        [icc_data], pydyf.Dictionary({'N': 3, 'Alternate': '/DeviceRGB'}),
        compress=compress)
    pdf.add_object(profile)
    output_intent = pydyf.Dictionary({
        'Type': '/OutputIntent',
        'S': '/GTS_PDFA1',
        'OutputConditionIdentifier': pydyf.String('sRGB IEC61966-2.1'),
        'DestOutputProfile': profile.reference,
    })
    pdf.catalog['OutputIntents'] = pydyf.Array([output_intent])

    # Handle attachments.
    catalog = pdf.catalog
    if version == 1:
        # Remove embedded files dictionary.
        if 'Names' in catalog and 'EmbeddedFiles' in catalog['Names']:
            del catalog['Names']['EmbeddedFiles']
    if version <= 2:
        # Remove all attachments for version 1.
        # Remove non-PDF attachments for version 2.
        # TODO: check that PDFs are actually PDF/A-2+ files.
        for filespec in _filespecs(pdf):
            embedded = _embedded_file(pdf, filespec)
            allowed = (
                version != 1 and
                embedded.extra['Subtype'] == '/application#2fpdf')
            if not allowed:
                del filespec['EF']
    if version >= 3:
        # Add AF for attachments.
        relationships = {
            f'<{item.md5}>': item.relationship
            for item in attachments if item.md5}
        associated = []
        if 'Names' in catalog and 'EmbeddedFiles' in catalog['Names']:
            tree_number = int(catalog['Names']['EmbeddedFiles'].split()[0])
            # Name trees alternate names and values, keep values only.
            associated.extend(pdf.objects[tree_number]['Names'][1::2])
        for filespec in _filespecs(pdf):
            embedded = _embedded_file(pdf, filespec)
            checksum = embedded.extra['Params']['CheckSum']
            relationship = relationships.get(checksum, 'Unspecified')
            filespec['AFRelationship'] = f'/{relationship}'
            associated.append(filespec.reference)
        if associated:
            if 'AF' not in catalog:
                catalog['AF'] = pydyf.Array()
            catalog['AF'].extend(associated)

    # Print annotations.
    print_flag = 2 ** (3 - 1)
    for candidate in pdf.objects:
        if isinstance(candidate, dict) and candidate.get('Type') == '/Annot':
            candidate['F'] = print_flag

    # Common PDF metadata stream.
    # Metadata compression is forbidden for version 1.
    add_metadata(
        pdf, metadata, 'a', version, variant,
        False if version == 1 else compress)
def _variant(version, variant, pdf_version):
    """Return the (function, options) pair stored for a PDF/A variant."""
    return (
        partial(pdfa, version=version, variant=variant),
        {'version': pdf_version, 'identifier': True})


# PDF/A variants: name -> (function applied to the PDF, options)
VARIANTS = {
    'pdf/a-1b': _variant(1, 'B', '1.4'),
    'pdf/a-2b': _variant(2, 'B', '1.7'),
    'pdf/a-3b': _variant(3, 'B', '1.7'),
    'pdf/a-4b': _variant(4, 'B', '2.0'),
    'pdf/a-2u': _variant(2, 'U', '1.7'),
    'pdf/a-3u': _variant(3, 'U', '1.7'),
    'pdf/a-4u': _variant(4, 'U', '2.0'),
}

View File

@@ -0,0 +1,125 @@
"""PDF/UA generation."""
import pydyf
from .metadata import add_metadata
def pdfua(pdf, metadata, document, page_streams, attachments, compress):
    """Set metadata for PDF/UA documents.

    Build the structure tree of the document from the marked content of
    each page stream, add the metadata stream and set the catalog entries
    ``Lang`` (when missing), ``ViewerPreferences`` and ``MarkInfo``.

    :param pdf:
        Document to modify in place.
    :param metadata:
        Document metadata, given to ``add_metadata``.
    :param document:
        Object whose ``build_element_structure`` method fills a dictionary
        mapping elements to ``{'parent': ...}`` data.
    :param page_streams:
        Page streams, in page order, whose ``marked`` list gives the
        ``(tag, box)`` pair of each marked content ID. May be empty.
    :param attachments:
        Not used by this function.
    :param compress:
        Compression setting of the metadata stream.

    """
    # Structure for PDF tagging
    content_mapping = pydyf.Dictionary({})
    pdf.add_object(content_mapping)
    structure_root = pydyf.Dictionary({
        'Type': '/StructTreeRoot',
        'ParentTree': content_mapping.reference,
    })
    pdf.add_object(structure_root)
    structure_document = pydyf.Dictionary({
        'Type': '/StructElem',
        'S': '/Document',
        'P': structure_root.reference,
    })
    pdf.add_object(structure_document)
    structure_root['K'] = pydyf.Array([structure_document.reference])
    pdf.catalog['StructTreeRoot'] = structure_root.reference

    document_children = []
    content_mapping['Nums'] = pydyf.Array()
    links = []
    # First key of the parent tree that is not used by a page. Tracked
    # explicitly so that it is defined even when there is no page stream.
    next_struct_parent = 0
    for page_number, page_stream in enumerate(page_streams):
        structure = {}
        document.build_element_structure(structure)
        # Reference of the structure element holding each marked content
        parents = [None] * len(page_stream.marked)
        for mcid, (key, box) in enumerate(page_stream.marked):
            # Build structure elements
            kids = [mcid]
            if key == 'Link':
                object_reference = pydyf.Dictionary({
                    'Type': '/OBJR',
                    'Obj': box.link_annotation.reference,
                    'Pg': pdf.page_references[page_number],
                })
                pdf.add_object(object_reference)
                links.append((object_reference.reference, box.link_annotation))
            etree_element = box.element
            child_structure_data_element = None
            # Walk from the box up to the root of the structure, creating
            # missing structure elements, and stop as soon as an element
            # that already exists is reached.
            while True:
                if etree_element is None:
                    # Box with no element: attached to the document
                    structure_data = structure.setdefault(
                        box, {'parent': None})
                else:
                    structure_data = structure[etree_element]
                new_element = 'element' not in structure_data
                if new_element:
                    child = structure_data['element'] = pydyf.Dictionary({
                        'Type': '/StructElem',
                        'S': f'/{key}',
                        'K': pydyf.Array(kids),
                        'Pg': pdf.page_references[page_number],
                    })
                    pdf.add_object(child)
                    if key == 'LI':
                        # List items get an intermediate label or body
                        # element that holds the kids
                        if etree_element.tag == 'dt':
                            sub_key = 'Lbl'
                        else:
                            sub_key = 'LBody'
                        real_child = pydyf.Dictionary({
                            'Type': '/StructElem',
                            'S': f'/{sub_key}',
                            'K': pydyf.Array(kids),
                            'Pg': pdf.page_references[page_number],
                            'P': child.reference,
                        })
                        pdf.add_object(real_child)
                        for kid in kids:
                            if isinstance(kid, int):
                                parents[kid] = real_child.reference
                        child['K'] = pydyf.Array([real_child.reference])
                        # Following kids of this element go into the
                        # intermediate element
                        structure_data['element'] = real_child
                    else:
                        for kid in kids:
                            if isinstance(kid, int):
                                parents[kid] = child.reference
                else:
                    child = structure_data['element']
                    child['K'].extend(kids)
                    for kid in kids:
                        if isinstance(kid, int):
                            parents[kid] = child.reference
                kid = child.reference
                if child_structure_data_element is not None:
                    child_structure_data_element['P'] = kid
                if not new_element:
                    break
                kids = [kid]
                child_structure_data_element = child
                if structure_data['parent'] is None:
                    child['P'] = structure_document.reference
                    document_children.append(child.reference)
                    break
                else:
                    etree_element = structure_data['parent']
                    key = page_stream.get_marked_content_tag(etree_element.tag)
        content_mapping['Nums'].append(page_number)
        content_mapping['Nums'].append(pydyf.Array(parents))
        next_struct_parent = page_number + 1
    structure_document['K'] = pydyf.Array(document_children)
    # Link annotations use the parent tree keys following the pages' ones
    for i, (link, annotation) in enumerate(links, start=next_struct_parent):
        content_mapping['Nums'].append(i)
        content_mapping['Nums'].append(link)
        annotation['StructParent'] = i
        # Annotation flags with only bit 2 set
        annotation['F'] = 2 ** (2 - 1)

    # Common PDF metadata stream
    add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress)

    # PDF document extra metadata
    if 'Lang' not in pdf.catalog:
        pdf.catalog['Lang'] = pydyf.String()
    pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({
        'DisplayDocTitle': 'true',
    })
    pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
# PDF/UA variants: name -> (function applied to the PDF, options)
VARIANTS = {
    'pdf/ua-1': (pdfua, {'mark': True}),
}

View File

@@ -0,0 +1,489 @@
"""PDF stream."""
import io
from hashlib import md5
import pydyf
from fontTools import subset
from fontTools.ttLib import TTFont, TTLibError, ttFont
from fontTools.varLib.mutator import instantiateVariableFont
from ..logger import LOGGER
from ..matrix import Matrix
from ..text.constants import PANGO_STRETCH_PERCENT
from ..text.ffi import ffi, harfbuzz, pango, units_to_double
from ..text.fonts import get_hb_object_data, get_pango_font_hb_face, get_pango_font_key
class Font:
    """Font used in PDF streams, built from a Pango font."""

    def __init__(self, pango_font):
        """Read the description, metrics and file data of ``pango_font``."""
        self.hb_font = pango.pango_font_get_hb_font(pango_font)
        self.hb_face = get_pango_font_hb_face(pango_font)
        self.file_content = get_hb_object_data(self.hb_face)
        self.index = harfbuzz.hb_face_get_index(self.hb_face)

        # Description
        metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
        description = ffi.gc(
            pango.pango_font_describe(pango_font),
            pango.pango_font_description_free)
        self.description = description
        self.font_size = pango.pango_font_description_get_size(description)
        self.style = pango.pango_font_description_get_style(description)
        self.family = ffi.string(
            pango.pango_font_description_get_family(description))

        # Variations, stored as "tag=value" pairs separated by commas
        self.variations = {}
        raw_variations = pango.pango_font_description_get_variations(
            description)
        if raw_variations != ffi.NULL:
            pairs = (
                part.split('=')
                for part in ffi.string(raw_variations).decode().split(','))
            self.variations = {pair[0]: float(pair[1]) for pair in pairs}

        # Report variations in the description, as they are part of the
        # string used below to build the font hash and name
        if 'wght' in self.variations:
            weight = int(round(self.variations['wght']))
            pango.pango_font_description_set_weight(description, weight)
        if self.variations.get('ital'):
            pango.pango_font_description_set_style(
                description, pango.PANGO_STYLE_ITALIC)
        elif self.variations.get('slnt'):
            pango.pango_font_description_set_style(
                description, pango.PANGO_STYLE_OBLIQUE)
        if 'wdth' in self.variations:
            width = self.variations['wdth']
            closest = min(
                PANGO_STRETCH_PERCENT.items(),
                key=lambda item: abs(item[0] - width))
            pango.pango_font_description_set_stretch(description, closest[1])
        description_string = ffi.string(
            pango.pango_font_description_to_string(description))

        # Never use the built-in hash function here: it's not stable
        digest = md5(description_string, usedforsecurity=False).digest()
        self.hash = ''.join(chr(65 + byte % 26) for byte in digest[:6])

        # Name
        fields = description_string.split(b' ')
        if fields and b'=' in fields[-1]:
            fields.pop()  # Remove variations
        if fields:
            fields.pop()  # Remove font size
        else:
            fields = [b'Unknown']
        self.name = b''.join(
            (b'/', self.hash.encode(), b'+', b'-'.join(fields)))

        # Ascent & descent
        if self.font_size:
            ascent = pango.pango_font_metrics_get_ascent(metrics)
            self.ascent = int(ascent / self.font_size * 1000)
            descent = pango.pango_font_metrics_get_descent(metrics)
            self.descent = -int(descent / self.font_size * 1000)
        else:
            self.ascent = self.descent = 0

        # Fonttools
        try:
            self.ttfont = TTFont(
                io.BytesIO(self.file_content), fontNumber=self.index)
        except Exception:
            LOGGER.warning('Unable to read font')
            self.ttfont = None
            self.bitmap = False
        else:
            # Bitmap font: bitmap tables with no glyph outlines
            if 'EBDT' in self.ttfont and 'EBLC' in self.ttfont:
                self.bitmap = (
                    'glyf' not in self.ttfont or
                    not self.ttfont['glyf'].glyphs)
            else:
                self.bitmap = False

        # Various properties
        self.italic_angle = 0  # TODO: this should be different
        self.upem = harfbuzz.hb_face_get_upem(self.hb_face)
        self.png = harfbuzz.hb_ot_color_has_png(self.hb_face)
        self.svg = harfbuzz.hb_ot_color_has_svg(self.hb_face)
        self.stemv = 80
        self.stemh = 80
        self.widths = {}
        self.cmap = {}
        self.used_in_forms = False

        # Font flags
        flags = 2 ** (3 - 1)  # Symbolic, custom character set
        if self.style:
            flags += 2 ** (7 - 1)  # Italic
        if b'Serif' in fields:
            flags += 2 ** (2 - 1)  # Serif
        self.flags = flags

    def clean(self, cmap, hinting):
        """Update the font data before it is stored.

        The font is subset when ``cmap`` is not empty, transformed into a
        static font when it is variable, and its glyphs are replaced by
        empty ones when it includes PNG or SVG glyphs. Nothing is done when
        the font could not be read. ``hinting`` is given to the subsetter.

        """
        if self.ttfont is None:
            return
        if cmap:
            self._subset(cmap, hinting)
        if 'fvar' in self.ttfont:
            self._instantiate_variations()
        if self.png or self.svg:
            self._blank_color_glyphs()

    def _subset(self, cmap, hinting):
        """Subset font, keeping the glyph IDs given by ``cmap``."""
        options = subset.Options(
            retain_gids=True, passthrough_tables=True,
            ignore_missing_glyphs=True, hinting=hinting,
            desubroutinize=True)
        options.drop_tables += ['GSUB', 'GPOS', 'SVG']
        subsetter = subset.Subsetter(options)
        subsetter.populate(gids=cmap)
        try:
            subsetter.subset(self.ttfont)
        except TTLibError:
            LOGGER.warning('Unable to optimize font')
            return
        optimized_font = io.BytesIO()
        self.ttfont.save(optimized_font)
        self.file_content = optimized_font.getvalue()

    def _instantiate_variations(self):
        """Transform variable into static font."""
        variations = self.variations
        if 'wght' not in variations:
            variations['wght'] = pango.pango_font_description_get_weight(
                self.description)
        if 'opsz' not in variations:
            variations['opsz'] = units_to_double(self.font_size)
        if 'slnt' not in variations:
            slant = 0
            if self.style == 1:
                for axis in self.ttfont['fvar'].axes:
                    if axis.axisTag != 'slnt':
                        continue
                    # Use the bound of the axis that is not 0
                    slant = (
                        axis.minValue if axis.maxValue == 0
                        else axis.maxValue)
                    break
            variations['slnt'] = slant
        if 'ital' not in variations:
            variations['ital'] = int(self.style == 2)
        partial_font = io.BytesIO()
        try:
            static_font = instantiateVariableFont(self.ttfont, variations)
            advances = static_font['hmtx'].metrics
            for glyph_name, (advance, bearing) in advances.items():
                if advance < 0:
                    advances[glyph_name] = (0, bearing)
            static_font.save(partial_font)
        except Exception:
            LOGGER.warning('Unable to mutate variable font')
        else:
            self.ttfont = static_font
            self.file_content = partial_font.getvalue()

    def _blank_color_glyphs(self):
        """Add empty glyphs instead of PNG or SVG emojis."""
        font = self.ttfont
        try:
            if 'loca' not in font or 'glyf' not in font:
                font['loca'] = ttFont.getTableClass('loca')()
                glyf = ttFont.getTableClass('glyf')()
                font['glyf'] = glyf
                glyf.glyphOrder = font.getGlyphOrder()
                glyf.glyphs = {
                    name: ttFont.getTableModule('glyf').Glyph()
                    for name in glyf.glyphOrder}
            else:
                glyf = font['glyf']
                for glyph_name in glyf.glyphs:
                    glyf[glyph_name] = ttFont.getTableModule('glyf').Glyph()
            for table_name in ('CBDT', 'CBLC', 'SVG '):
                if table_name in font:
                    del font[table_name]
            output_font = io.BytesIO()
            font.save(output_font)
            self.file_content = output_font.getvalue()
        except TTLibError:
            LOGGER.warning('Unable to save emoji font')

    @property
    def type(self):
        """Font type: 'otf' if data starts with ``OTTO``, 'ttf' otherwise."""
        if self.file_content[:4] == b'OTTO':
            return 'otf'
        return 'ttf'
class Stream(pydyf.Stream):
    """PDF stream object with extra features.

    On top of ``pydyf.Stream``, it tracks the current transformation matrix,
    skips operators setting again the current color, alpha or font, stores
    resources (graphic states, groups, patterns, shadings, images and fonts)
    in the dictionaries given at creation, and records marked content.

    """

    # Structure type of HTML tags, other tags being 'NonStruct'
    _STRUCTURE_TYPES = {
        'div': 'Div',
        'span': 'Span',
        'article': 'Art',
        'section': 'Sect',
        'blockquote': 'BlockQuote',
        'p': 'P',
        'h1': 'H1', 'h2': 'H2', 'h3': 'H3',
        'h4': 'H4', 'h5': 'H5', 'h6': 'H6',
        'dl': 'L', 'ul': 'L', 'ol': 'L',
        'li': 'LI', 'dt': 'LI', 'dd': 'LI',
        'table': 'Table',
        'tr': 'TR', 'th': 'TH', 'td': 'TD',
        'thead': 'THead', 'tbody': 'TBody', 'tfoot': 'TFoot',
    }

    def __init__(self, fonts, page_rectangle, states, x_objects, patterns,
                 shadings, images, mark, *args, **kwargs):
        """Create a stream storing its resources in the given dictionaries.

        ``fonts``, ``states``, ``x_objects``, ``patterns``, ``shadings`` and
        ``images`` are filled by the ``add_*`` and ``set_*`` methods,
        ``mark`` enables marked content, and extra arguments are given to
        ``pydyf.Stream``.

        """
        super().__init__(*args, **kwargs)
        self.page_rectangle = page_rectangle
        self.marked = []

        # Shared resources
        self._fonts = fonts
        self._states = states
        self._x_objects = x_objects
        self._patterns = patterns
        self._shadings = shadings
        self._images = images
        self._mark = mark

        # Current graphic state
        self._current_color = None
        self._current_color_stroke = None
        self._current_alpha = None
        self._current_alpha_stroke = None
        self._current_font = None
        self._current_font_size = None
        self._old_font = None
        self._old_font_size = None
        self._ctm_stack = [Matrix()]

        # These objects are used in text.show_first_line
        self.length = ffi.new('unsigned int *')
        self.ink_rect = ffi.new('PangoRectangle *')
        self.logical_rect = ffi.new('PangoRectangle *')

    @property
    def ctm(self):
        """Current transformation matrix, last item of the matrix stack."""
        return self._ctm_stack[-1]

    def push_state(self):
        """Save the graphic state and duplicate the current matrix."""
        super().push_state()
        self._ctm_stack.append(self.ctm)

    def pop_state(self):
        """Restore the graphic state and forget current color, alpha, font.

        When the last item of the stream is ``b'q'``, this item is removed
        instead of calling the parent method.

        """
        if self.stream and self.stream[-1] == b'q':
            self.stream.pop()
        else:
            super().pop_state()
        self._current_color = None
        self._current_color_stroke = None
        self._current_alpha = None
        self._current_alpha_stroke = None
        self._current_font = None
        self._ctm_stack.pop()
        assert self._ctm_stack

    def transform(self, a=1, b=0, c=0, d=1, e=0, f=0):
        """Apply the matrix and multiply the current matrix by it."""
        super().transform(a, b, c, d, e, f)
        self._ctm_stack[-1] = Matrix(a, b, c, d, e, f) @ self.ctm

    def begin_text(self):
        """Begin text, or reopen the text whose end is the last item."""
        reopen = self.stream and self.stream[-1] == b'ET'
        if not reopen:
            super().begin_text()
            return
        self._current_font = self._old_font
        self.stream.pop()

    def end_text(self):
        """End text, keeping the current font for ``begin_text``."""
        self._old_font = self._current_font
        self._current_font = None
        super().end_text()

    def set_color_rgb(self, r, g, b, stroke=False):
        """Set the fill or stroke color, unless it is the current one."""
        color = (r, g, b)
        attribute = '_current_color_stroke' if stroke else '_current_color'
        if color == getattr(self, attribute):
            return
        setattr(self, attribute, color)
        super().set_color_rgb(r, g, b, stroke)

    def set_font_size(self, font, size):
        """Set the font and its size, unless they are the current ones."""
        font_size = (font, size)
        if font_size == self._current_font:
            return
        self._current_font = font_size
        super().set_font_size(font, size)

    def set_state(self, state):
        """Store ``state`` under a new key of states, and use it."""
        key = f's{len(self._states)}'
        self._states[key] = state
        super().set_state(key)

    def set_alpha(self, alpha, stroke=False, fill=None):
        """Set stroke and/or fill alpha, unless it is the current one.

        Alpha is set for fill when ``fill`` is true, or when ``fill`` is
        ``None`` and ``stroke`` is false.

        """
        if fill is None:
            fill = not stroke
        targets = (
            (stroke, 'A', 'CA', '_current_alpha_stroke'),
            (fill, 'a', 'ca', '_current_alpha'),
        )
        for enabled, prefix, entry, attribute in targets:
            if not enabled:
                continue
            key = f'{prefix}{alpha}'
            if key == getattr(self, attribute):
                continue
            setattr(self, attribute, key)
            if key not in self._states:
                self._states[key] = pydyf.Dictionary({entry: alpha})
            super().set_state(key)

    def set_alpha_state(self, x, y, width, height):
        """Use a new group as luminosity mask, and return this group."""
        alpha_stream = self.add_group(x, y, width, height)
        mask = pydyf.Dictionary({
            'Type': '/Mask',
            'S': '/Luminosity',
            'G': alpha_stream,
        })
        self.set_state(pydyf.Dictionary({
            'Type': '/ExtGState',
            'SMask': mask,
            'ca': 1,
            'AIS': 'false',
        }))
        return alpha_stream

    def set_blend_mode(self, mode):
        """Use a new graphic state whose blend mode is ``mode``."""
        blend_state = pydyf.Dictionary({
            'Type': '/ExtGState',
            'BM': f'/{mode}',
        })
        self.set_state(blend_state)

    def add_font(self, pango_font):
        """Return the font of ``pango_font``, created if not stored yet."""
        fonts = self._fonts
        font_key = get_pango_font_key(pango_font)
        if font_key not in fonts:
            fonts[font_key] = Font(pango_font)
        return fonts[font_key]

    @staticmethod
    def _new_resources():
        """Return a resources dictionary with new empty dictionaries."""
        return pydyf.Dictionary({
            'ExtGState': pydyf.Dictionary(),
            'XObject': pydyf.Dictionary(),
            'Pattern': pydyf.Dictionary(),
            'Shading': pydyf.Dictionary(),
            'Font': None,  # Will be set by _use_references
        })

    def _new_child(self, extra):
        """Create a stream using the resources stored in ``extra``.

        Fonts, images, page rectangle, marking and compression settings are
        the ones of the current stream.

        """
        resources = extra['Resources']
        return Stream(
            self._fonts, self.page_rectangle, resources['ExtGState'],
            resources['XObject'], resources['Pattern'], resources['Shading'],
            self._images, self._mark, extra=extra, compress=self.compress)

    def add_group(self, x, y, width, height):
        """Create a transparency group, store it and return it."""
        extra = pydyf.Dictionary({
            'Type': '/XObject',
            'Subtype': '/Form',
            'BBox': pydyf.Array((x, y, x + width, y + height)),
            'Resources': self._new_resources(),
            'Group': pydyf.Dictionary({
                'Type': '/Group',
                'S': '/Transparency',
                'I': 'true',
                'CS': '/DeviceRGB',
            }),
        })
        group = self._new_child(extra)
        group.id = f'x{len(self._x_objects)}'
        self._x_objects[group.id] = group
        return group

    def add_image(self, image, interpolate, ratio):
        """Register ``image`` to be drawn with ``ratio``, return its name."""
        image_name = f'i{image.id}{int(interpolate)}'
        self._x_objects[image_name] = None  # Set by write_pdf
        if image_name in self._images:
            # Reuse image already stored in document
            self._images[image_name]['dpi_ratios'].add(ratio)
        else:
            self._images[image_name] = {
                'image': image,
                'interpolate': interpolate,
                'dpi_ratios': {ratio},
                'x_object': None,  # Set by write_pdf
            }
        return image_name

    def add_pattern(self, x, y, width, height, repeat_width, repeat_height,
                    matrix):
        """Create a tiling pattern, store it and return it."""
        extra = pydyf.Dictionary({
            'Type': '/Pattern',
            'PatternType': 1,
            'BBox': pydyf.Array([x, y, x + width, y + height]),
            'XStep': repeat_width,
            'YStep': repeat_height,
            'TilingType': 1,
            'PaintType': 1,
            'Matrix': pydyf.Array(matrix.values),
            'Resources': self._new_resources(),
        })
        pattern = self._new_child(extra)
        pattern.id = f'p{len(self._patterns)}'
        self._patterns[pattern.id] = pattern
        return pattern

    def add_shading(self, shading_type, color_space, domain, coords, extend,
                    function):
        """Create a shading dictionary, store it and return it."""
        shading = pydyf.Dictionary({
            'ShadingType': shading_type,
            'ColorSpace': f'/Device{color_space}',
            'Domain': pydyf.Array(domain),
            'Coords': pydyf.Array(coords),
            'Function': function,
        })
        if extend:
            shading['Extend'] = pydyf.Array((b'true', b'true'))
        shading.id = f's{len(self._shadings)}'
        self._shadings[shading.id] = shading
        return shading

    def begin_marked_content(self, box, mcid=False, tag=None):
        """Begin marked content for ``box``, if marking is enabled.

        The tag is found from the box element tag when ``tag`` is ``None``.
        When ``mcid`` is true, the content gets an ID that is the index of
        its ``(tag, box)`` pair in ``marked``.

        """
        if not self._mark:
            return
        if tag is None:
            tag = self.get_marked_content_tag(box.element_tag)
        if mcid:
            property_list = pydyf.Dictionary({'MCID': len(self.marked)})
            self.marked.append((tag, box))
        else:
            property_list = None
        super().begin_marked_content(tag, property_list)

    def end_marked_content(self):
        """End marked content, if marking is enabled."""
        if self._mark:
            super().end_marked_content()

    @staticmethod
    def create_interpolation_function(domain, c0, c1, n):
        """Return a function dictionary of type 2."""
        function = pydyf.Dictionary({'FunctionType': 2})
        function['Domain'] = pydyf.Array(domain)
        function['C0'] = pydyf.Array(c0)
        function['C1'] = pydyf.Array(c1)
        function['N'] = n
        return function

    @staticmethod
    def create_stitching_function(domain, encode, bounds, sub_functions):
        """Return a function dictionary of type 3."""
        function = pydyf.Dictionary({'FunctionType': 3})
        function['Domain'] = pydyf.Array(domain)
        function['Encode'] = pydyf.Array(encode)
        function['Bounds'] = pydyf.Array(bounds)
        function['Functions'] = pydyf.Array(sub_functions)
        return function

    def get_marked_content_tag(self, element_tag):
        """Return the structure type used for an HTML tag."""
        return self._STRUCTURE_TYPES.get(element_tag, 'NonStruct')