tractatus/pptx-env/lib/python3.12/site-packages/weasyprint/pdf/fonts.py

"""Fonts integration in PDF."""

import io
import re
from hashlib import md5
from logging import WARNING
from math import ceil

import pydyf
from fontTools import subset
from fontTools.ttLib import TTFont, TTLibError, ttFont
from fontTools.varLib.mutator import instantiateVariableFont

from ..logger import LOGGER, capture_logs
from ..text.constants import PANGO_STRETCH_PERCENT
from ..text.ffi import FROM_UNITS, ffi, harfbuzz, harfbuzz_subset, pango
from ..text.fonts import get_hb_object_data, get_pango_font_hb_face


class Font:
    def __init__(self, pango_font, description, font_size):
        self.hb_font = pango.pango_font_get_hb_font(pango_font)
        self.hb_face = get_pango_font_hb_face(pango_font)
        self.file_content = get_hb_object_data(self.hb_face)
        self.index = harfbuzz.hb_face_get_index(self.hb_face)

        self.font_size = font_size
        self.style = pango.pango_font_description_get_style(description)
        self.family = ffi.string(pango.pango_font_description_get_family(description))

        self.variations = {}
        variations = pango.pango_font_description_get_variations(description)
        if variations != ffi.NULL:
            self.variations = {
                part.split('=')[0]: float(part.split('=')[1])
                for part in ffi.string(variations).decode().split(',')}
        if weight := self.variations.get('weight'):
            self.weight = round(weight)
            pango.pango_font_description_set_weight(description, weight)
        else:
            self.weight = pango.pango_font_description_get_weight(description)
        if self.variations.get('ital'):
            pango.pango_font_description_set_style(
                description, pango.PANGO_STYLE_ITALIC)
        elif self.variations.get('slnt'):
            pango.pango_font_description_set_style(
                description, pango.PANGO_STYLE_OBLIQUE)
        if (width := self.variations.get('wdth')) is not None:
            stretch = min(
                PANGO_STRETCH_PERCENT.items(),
                key=lambda item: abs(item[0] - width))[1]
            pango.pango_font_description_set_stretch(description, stretch)
        description_string = ffi.string(
            pango.pango_font_description_to_string(description))

        # Never use the built-in hash function here: it’s not stable.
        self.hash = ''.join(
            chr(65 + letter % 26) for letter
            in md5(description_string, usedforsecurity=False).digest()[:6])

        # Set font name.
        name = re.split(b' [#@]', description_string)[0]
        self.name = b'/' + self.hash.encode() + b'+' + name.replace(b' ', b'-')

        # Set ascent and descent.
        if self.font_size:
            pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
            self.ascent = round(
                pango.pango_font_metrics_get_ascent(pango_metrics) * FROM_UNITS /
                self.font_size * 1000)
            self.descent = -round(
                pango.pango_font_metrics_get_descent(pango_metrics) * FROM_UNITS /
                self.font_size * 1000)
        else:
            self.ascent = self.descent = 0

        # Get font tables and set metadata.
        table_count = ffi.new('unsigned int *', 100)
        table_tags = ffi.new('hb_tag_t[100]')
        table_name = ffi.new('char[4]')
        harfbuzz.hb_face_get_table_tags(self.hb_face, 0, table_count, table_tags)
        self.tables = []
        for i in range(table_count[0]):
            harfbuzz.hb_tag_to_string(table_tags[i], table_name)
            self.tables.append(ffi.string(table_name).decode())
        self.bitmap = False
        if 'EBDT' in self.tables and 'EBLC' in self.tables:
            if 'glyf' in self.tables:
                tag = harfbuzz.hb_tag_from_string(b'glyf', -1)
                blob = harfbuzz.hb_face_reference_table(self.hb_face, tag)
                if harfbuzz.hb_blob_get_length(blob) == 0:
                    self.bitmap = True
                harfbuzz.hb_blob_destroy(blob)
            else:
                self.bitmap = True
        self.italic_angle = 0  # TODO: this should be different
        self.upem = harfbuzz.hb_face_get_upem(self.hb_face)
        self.png = harfbuzz.hb_ot_color_has_png(self.hb_face)
        self.svg = harfbuzz.hb_ot_color_has_svg(self.hb_face)
        self.stemv = 80
        self.stemh = 80
        self.widths = {}
        self.cmap = {}
        self.used_in_forms = False

        # Set font flags.
        self.flags = 2 ** (3 - 1)  # Symbolic, custom character set
        if self.style:
            self.flags += 2 ** (7 - 1)  # Italic
        if b'Serif' in name.split(b' '):
            self.flags += 2 ** (2 - 1)  # Serif

    def clean(self, cmap, hinting):
        """Remove useless data from font."""

        # Subset font.
        self.subset(cmap, hinting)

        # Transform variable into static font.
        if 'fvar' in self.tables:
            full_font = io.BytesIO(self.file_content)
            ttfont = TTFont(full_font, fontNumber=self.index)
            if 'wght' not in self.variations:
                self.variations['wght'] = self.weight
            if 'opsz' not in self.variations:
                self.variations['opsz'] = self.font_size
            if 'slnt' not in self.variations:
                slnt = 0
                if self.style == 1:
                    for axe in ttfont['fvar'].axes:
                        if axe.axisTag == 'slnt':
                            if axe.maxValue == 0:
                                slnt = axe.minValue
                            else:
                                slnt = axe.maxValue
                            break
                self.variations['slnt'] = slnt
            if 'ital' not in self.variations:
                self.variations['ital'] = int(self.style == 2)
            partial_font = io.BytesIO()
            try:
                ttfont = instantiateVariableFont(ttfont, self.variations)
                for key, (advance, bearing) in ttfont['hmtx'].metrics.items():
                    if advance < 0:
                        ttfont['hmtx'].metrics[key] = (0, bearing)
                ttfont.save(partial_font)
            except Exception:
                LOGGER.warning('Unable to mutate variable font')
            else:
                self.file_content = partial_font.getvalue()

        # Remove images.
        if self.png or self.svg:
            full_font = io.BytesIO(self.file_content)
            ttfont = TTFont(full_font, fontNumber=self.index)
            try:
                # Add empty glyphs instead of PNG or SVG emojis.
                if 'loca' not in self.tables or 'glyf' not in self.tables:
                    ttfont['loca'] = ttFont.getTableClass('loca')()
                    ttfont['glyf'] = ttFont.getTableClass('glyf')()
                    ttfont['glyf'].glyphOrder = ttfont.getGlyphOrder()
                    ttfont['glyf'].glyphs = {
                        name: ttFont.getTableModule('glyf').Glyph()
                        for name in ttfont['glyf'].glyphOrder}
                else:
                    for glyph in ttfont['glyf'].glyphs:
                        ttfont['glyf'][glyph] = ttFont.getTableModule('glyf').Glyph()
                for table_name in ('CBDT', 'CBLC', 'SVG '):
                    if table_name in ttfont:
                        del ttfont[table_name]
                output_font = io.BytesIO()
                ttfont.save(output_font)
                self.file_content = output_font.getvalue()
            except TTLibError:
                LOGGER.warning('Unable to save emoji font')

    @property
    def type(self):
        return 'otf' if self.file_content[:4] == b'OTTO' else 'ttf'

    def subset(self, cmap, hinting):
        """Remove unused glyphs and tables from font."""
        if not cmap:
            return

        if harfbuzz_subset and harfbuzz.hb_version_atleast(4, 1, 0):
            # 4.1.0 is required for hb_set_add_sorted_array.
            self._harfbuzz_subset(cmap, hinting)
        else:
            self._fonttools_subset(cmap, hinting)

    def _harfbuzz_subset(self, cmap, hinting):
        """Subset font using Harfbuzz."""
        hb_subset = ffi.gc(
            harfbuzz_subset.hb_subset_input_create_or_fail(),
            harfbuzz_subset.hb_subset_input_destroy)

        # Only keep used glyphs.
        gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
        gid_array = ffi.new(f'hb_codepoint_t[{len(cmap)}]', sorted(cmap))
        harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(cmap))

        # Set flags.
        flags = (
            harfbuzz_subset.HB_SUBSET_FLAGS_RETAIN_GIDS |
            harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
            harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
        harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)

        # Drop useless tables.
        drop_set = harfbuzz_subset.hb_subset_input_set(
            hb_subset, harfbuzz_subset.HB_SUBSET_SETS_DROP_TABLE_TAG)
        drop_tables = tuple(harfbuzz.hb_tag_from_string(name, -1) for name in (
            b'BASE', b'DSIG', b'EBDT', b'EBLC', b'EBSC', b'GPOS', b'GSUB', b'JSTF',
            b'LTSH', b'PCLT', b'SVG '))
        drop_tables_array = ffi.new(f'hb_codepoint_t[{len(drop_tables)}]', drop_tables)
        harfbuzz.hb_set_add_sorted_array(drop_set, drop_tables_array, len(drop_tables))

        # Subset font.
        hb_face = ffi.gc(
            harfbuzz_subset.hb_subset_or_fail(self.hb_face, hb_subset),
            harfbuzz.hb_face_destroy)

        # Drop empty glyphs after last one used.
        gid_set = harfbuzz_subset.hb_subset_input_glyph_set(hb_subset)
        keep = tuple(range(max(cmap) + 1))
        gid_array = ffi.new(f'hb_codepoint_t[{len(keep)}]', keep)
        harfbuzz.hb_set_add_sorted_array(gid_set, gid_array, len(keep))

        # Set flags.
        flags = (
            harfbuzz_subset.HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED |
            harfbuzz_subset.HB_SUBSET_FLAGS_DESUBROUTINIZE)
        if not hinting:
            flags |= harfbuzz_subset.HB_SUBSET_FLAGS_NO_HINTING
        harfbuzz_subset.hb_subset_input_set_flags(hb_subset, flags)

        # Subset font.
        hb_face = ffi.gc(
            harfbuzz_subset.hb_subset_or_fail(hb_face, hb_subset),
            harfbuzz.hb_face_destroy)

        # Store new font.
        if hb_face:
            file_content = get_hb_object_data(hb_face)
            if file_content:
                self.file_content = file_content
                return

        LOGGER.warning('Unable to subset font with Harfbuzz')

    def _fonttools_subset(self, cmap, hinting):
        """Subset font using Fonttools."""
        full_font = io.BytesIO(self.file_content)

        # Set subset options.
        options = subset.Options(
            retain_gids=True, passthrough_tables=True, ignore_missing_glyphs=True,
            hinting=hinting, desubroutinize=True)
        options.drop_tables += ['GSUB', 'GPOS', 'SVG']
        subsetter = subset.Subsetter(options)
        subsetter.populate(gids=cmap)

        # Subset font.
        try:
            ttfont = TTFont(full_font, fontNumber=self.index)
            with capture_logs('fontTools', level=WARNING) as logs:
                subsetter.subset(ttfont)
            for log in logs:
                LOGGER.warning(
                    'fontTools warning when subsetting "%s": %s',
                    self.family.decode(), log)
        except TTLibError:
            LOGGER.warning('Unable to subset font with fontTools')
        else:
            optimized_font = io.BytesIO()
            ttfont.save(optimized_font)
            self.file_content = optimized_font.getvalue()


def build_fonts_dictionary(pdf, fonts, compress, subset, options):
    """Build PDF dictionary for fonts."""
    pdf_fonts = pydyf.Dictionary()
    fonts_by_file_hash = {}
    for font in fonts.values():
        fonts_by_file_hash.setdefault(font.hash, []).append(font)
    font_references_by_file_hash = {}
    for file_hash, file_fonts in fonts_by_file_hash.items():
        # TODO: Find why we can have multiple fonts for one font file.
        font = file_fonts[0]
        if font.bitmap:
            continue

        # Clean font, optimize and handle emojis.
        cmap = {}
        if subset and not font.used_in_forms:
            for file_font in file_fonts:
                cmap = {**cmap, **file_font.cmap}
        font.clean(cmap, options['hinting'])

        # Include font.
        if font.type == 'otf':
            font_extra = pydyf.Dictionary({'Subtype': '/OpenType'})
        else:
            font_extra = pydyf.Dictionary({'Length1': len(font.file_content)})
        font_stream = pydyf.Stream([font.file_content], font_extra, compress=compress)
        pdf.add_object(font_stream)
        font_references_by_file_hash[file_hash] = font_stream.reference

    for font in fonts.values():
        if subset and not font.used_in_forms:
            # Only store widths and map for used glyphs
            font_widths = font.widths
            cmap = font.cmap
        else:
            # Store width and Unicode map for all glyphs
            full_font = io.BytesIO(font.file_content)
            ttfont = TTFont(full_font, fontNumber=font.index)
            font_widths, cmap = {}, {}
            for i, glyph in enumerate(ttfont.getGlyphSet().values()):
                font_widths[i] = glyph.width * 1000 / font.upem
            for letter, key in ttfont.getBestCmap().items():
                glyph = ttfont.getGlyphID(key)
                if glyph not in cmap:
                    cmap[glyph] = chr(letter)

        to_unicode = pydyf.Stream([
            b'/CIDInit /ProcSet findresource begin',
            b'12 dict begin',
            b'begincmap',
            b'/CIDSystemInfo',
            b'<< /Registry (Adobe)',
            b'/Ordering (UCS)',
            b'/Supplement 0',
            b'>> def',
            b'/CMapName /Adobe-Identity-UCS def',
            b'/CMapType 2 def',
            b'1 begincodespacerange',
            b'<0000> <ffff>',
            b'endcodespacerange'], compress=compress)
        cmap_length = len(cmap)
        cmap_items = tuple(cmap.items())
        for i in range(ceil(cmap_length / 100)):
            batch_length = min(100, cmap_length - i * 100)
            to_unicode.stream.append(f'{batch_length} beginbfchar'.encode())
            for glyph, text in cmap_items[i*100:(i+1)*100]:
                unicode_codepoints = ''.join(
                    f'{letter.encode("utf-16-be").hex()}' for letter in text)
                to_unicode.stream.append(
                    f'<{glyph:04x}> <{unicode_codepoints}>'.encode())
            to_unicode.stream.append(b'endbfchar')
        to_unicode.stream.extend([
            b'endcmap',
            b'CMapName currentdict /CMap defineresource pop',
            b'end',
            b'end'])
        pdf.add_object(to_unicode)
        font_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': f'/Type{3 if font.bitmap else 0}',
            'BaseFont': font.name,
            'ToUnicode': to_unicode.reference,
        })

        if font.bitmap:
            _build_bitmap_font_dictionary(
                font_dictionary, pdf, font, font_widths, compress, subset)
        else:
            _build_vector_font_dictionary(
                font_dictionary, pdf, font, font_widths, compress,
                font_references_by_file_hash[font.hash], options['pdf_version'])
        pdf.add_object(font_dictionary)
        pdf_fonts[font.hash] = font_dictionary.reference

    return pdf_fonts


def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths, compress, subset):
    # https://docs.microsoft.com/typography/opentype/spec/ebdt
    font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1])
    font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0])
    if subset:
        chars = tuple(sorted(font.cmap))
    else:
        chars = tuple(range(256))
    first, last = chars[0], chars[-1]
    differences = []
    for glyph in sorted(widths):
        if glyph - 1 not in widths:
            differences.append(glyph)
        differences.append(f'/{glyph}')
    font_dictionary['FirstChar'] = first
    font_dictionary['LastChar'] = last
    font_dictionary['Encoding'] = pydyf.Dictionary({
        'Type': '/Encoding',
        'Differences': pydyf.Array(differences),
    })
    char_procs = pydyf.Dictionary({})
    full_font = io.BytesIO(font.file_content)
    ttfont = TTFont(full_font, fontNumber=font.index)
    font_glyphs = ttfont['EBDT'].strikeData[0]
    widths = [0] * (last - first + 1)
    glyphs_info = {}
    for key, glyph in font_glyphs.items():
        glyph_format = glyph.getFormat()
        glyph_id = ttfont.getGlyphID(key)

        # Get and store glyph metrics.
        if glyph_format == 5:
            data = glyph.data
            subtables = ttfont['EBLC'].strikes[0].indexSubTables
            for subtable in subtables:
                first_index = subtable.firstGlyphIndex
                last_index = subtable.lastGlyphIndex
                if first_index <= glyph_id <= last_index:
                    height = subtable.metrics.height
                    advance = width = subtable.metrics.width
                    bearing_x = subtable.metrics.horiBearingX
                    bearing_y = subtable.metrics.horiBearingY
                    break
            else:
                LOGGER.warning(f'Unknown bitmap metrics for glyph: {glyph_id}')
                continue
        else:
            data_start = 5 if glyph_format in (1, 2, 8) else 8
            data = glyph.data[data_start:]
            height, width = glyph.data[0:2]
            bearing_x = int.from_bytes(glyph.data[2:3], 'big', signed=True)
            bearing_y = int.from_bytes(glyph.data[3:4], 'big', signed=True)
            advance = glyph.data[4]
        position_y = bearing_y - height
        if glyph_id in chars:
            widths[glyph_id - first] = advance
        stride = ceil(width / 8)
        glyph_info = glyphs_info[glyph_id] = {
            'width': width,
            'height': height,
            'x': bearing_x,
            'y': position_y,
            'stride': stride,
            'bitmap': None,
            'subglyphs': None,
        }

        # Decode bitmaps.
        if 0 in (width, height) or not data:
            glyph_info['bitmap'] = b''
        elif glyph_format in (1, 6):
            glyph_info['bitmap'] = data
        elif glyph_format in (2, 5, 7):
            padding = (8 - (width % 8)) % 8
            bits = bin(int(data.hex(), 16))[2:]
            bits = bits.zfill(8 * len(data))
            bitmap_bits = ''.join(
                bits[i * width:(i + 1) * width] + padding * '0'
                for i in range(height))
            glyph_info['bitmap'] = int(bitmap_bits, 2).to_bytes(height * stride, 'big')
        elif glyph_format in (8, 9):
            subglyphs = glyph_info['subglyphs'] = []
            i = 0 if glyph_format == 9 else 1
            number_of_components = int.from_bytes(data[i:i+2], 'big')
            for j in range(number_of_components):
                index = (i + 2) + (j * 4)
                subglyph_id = int.from_bytes(data[index:index+2], 'big')
                x = int.from_bytes(data[index+2:index+3], 'big', signed=True)
                y = int.from_bytes(data[index+3:index+4], 'big', signed=True)
                subglyphs.append({'id': subglyph_id, 'x': x, 'y': y})
        else:  # pragma: no cover
            LOGGER.warning(f'Unsupported bitmap glyph format: {glyph_format}')
            glyph_info['bitmap'] = bytes(height * stride)

    for glyph_id, glyph_info in glyphs_info.items():
        # Don’t store glyph not in cmap.
        if glyph_id not in chars:
            continue

        # Draw glyph.
        stride = glyph_info['stride']
        width = glyph_info['width']
        height = glyph_info['height']
        x = glyph_info['x']
        y = glyph_info['y']
        if glyph_info['bitmap'] is None:
            length = height * stride
            bitmap_int = int.from_bytes(bytes(length), 'big')
            for subglyph in glyph_info['subglyphs']:
                sub_x = subglyph['x']
                sub_y = subglyph['y']
                sub_id = subglyph['id']
                if sub_id not in glyphs_info:
                    LOGGER.warning(f'Unknown subglyph: {sub_id}')
                    continue
                subglyph = glyphs_info[sub_id]
                if subglyph['bitmap'] is None:
                    # TODO: Support subglyph in subglyph.
                    LOGGER.warning(f'Unsupported subglyph in subglyph: {sub_id}')
                    continue
                for row_y in range(subglyph['height']):
                    row_slice = slice(
                        row_y * subglyph['stride'],
                        (row_y + 1) * subglyph['stride'])
                    row = subglyph['bitmap'][row_slice]
                    row_int = int.from_bytes(row, 'big')
                    shift = stride * 8 * (height - sub_y - row_y - 1)
                    stride_difference = stride - subglyph['stride']
                    if stride_difference > 0:
                        row_int <<= stride_difference * 8
                    elif stride_difference < 0:
                        row_int >>= -stride_difference * 8
                    if sub_x > 0:
                        row_int >>= sub_x
                    elif sub_x < 0:
                        row_int <<= -sub_x
                    row_int %= 1 << stride * 8
                    row_int <<= shift
                    bitmap_int |= row_int
            bitmap = bitmap_int.to_bytes(length, 'big')
        else:
            bitmap = glyph_info['bitmap']
        bitmap_stream = pydyf.Stream([
            b'0 0 d0',
            f'{width} 0 0 {height} {x} {y} cm'.encode(),
            b'BI',
            b'/IM true',
            b'/W', width,
            b'/H', height,
            b'/BPC 1',
            b'/D [1 0]',
            b'ID', bitmap, b'EI'
        ], compress=compress)
        pdf.add_object(bitmap_stream)
        char_procs[glyph_id] = bitmap_stream.reference

    pdf.add_object(char_procs)
    font_dictionary['Widths'] = pydyf.Array(widths)
    font_dictionary['CharProcs'] = char_procs.reference


def _build_vector_font_dictionary(font_dictionary, pdf, font, widths, compress,
                                  reference, pdf_version):
    font_file = f'FontFile{3 if font.type == "otf" else 2}'
    max_x = max(widths.values()) if widths else 0
    bbox = (0, font.descent, max_x, font.ascent)
    flags = font.flags
    if len(widths) > 1 and len(set(font.widths.values())) == 1:
        flags += 2 ** (1 - 1)  # FixedPitch
    font_descriptor = pydyf.Dictionary({
        'Type': '/FontDescriptor',
        'FontName': font.name,
        'FontFamily': pydyf.String(font.family),
        'Flags': flags,
        'FontBBox': pydyf.Array(bbox),
        'ItalicAngle': font.italic_angle,
        'Ascent': font.ascent,
        'Descent': font.descent,
        'CapHeight': bbox[3],
        'StemV': font.stemv,
        'StemH': font.stemh,
        font_file: reference,
    })
    if str(pdf_version) <= '1.4':  # Cast for bytes and None
        cids = sorted(font.widths)
        padded_width = ceil((cids[-1] + 1) / 8)
        bits = ['0'] * padded_width * 8
        for cid in cids:
            bits[cid] = '1'
        stream = pydyf.Stream(
            (int(''.join(bits), 2).to_bytes(padded_width, 'big'),),
            compress=compress)
        pdf.add_object(stream)
        font_descriptor['CIDSet'] = stream.reference
    if font.type == 'otf':
        font_descriptor['Subtype'] = '/OpenType'
    pdf.add_object(font_descriptor)

    pdf_widths = pydyf.Array()
    for i in sorted(widths):
        if i - 1 not in widths:
            pdf_widths.append(i)
            current_widths = pydyf.Array()
            pdf_widths.append(current_widths)
        current_widths.append(widths[i])

    subfont_dictionary = pydyf.Dictionary({
        'Type': '/Font',
        'Subtype': f'/CIDFontType{0 if font.type == "otf" else 2}',
        'BaseFont': font.name,
        'CIDSystemInfo': pydyf.Dictionary({
            'Registry': pydyf.String('Adobe'),
            'Ordering': pydyf.String('Identity'),
            'Supplement': 0,
        }),
        'CIDToGIDMap': '/Identity',
        'W': pdf_widths,
        'FontDescriptor': font_descriptor.reference,
    })
    pdf.add_object(subfont_dictionary)
    font_dictionary['Encoding'] = '/Identity-H'
    font_dictionary['DescendantFonts'] = pydyf.Array([subfont_dictionary.reference])