152 lines
4.8 KiB
Python
152 lines
4.8 KiB
Python
|
#!/usr/bin/env python3
|
||
|
from collections import defaultdict
|
||
|
import codecs
|
||
|
import argparse
|
||
|
import os
|
||
|
import polib
|
||
|
import struct
|
||
|
import sys
|
||
|
|
||
|
import lib.charset as cs
|
||
|
from lib.io import info, warn, fatal, load_map
|
||
|
|
||
|
FW_MAGIC = 0x4bb45aa5
|
||
|
|
||
|
|
||
|
def translation_ref(translation):
    """Return a short human-readable reference for a PO entry.

    Prefers the ``MSG_*`` firmware symbol name embedded at the start of
    the entry's extracted comment; falls back to ``repr`` of the msgid
    when no such marker is present.
    """
    comment = translation.comment
    if not (comment and comment.startswith('MSG_')):
        return repr(translation.msgid)
    # first whitespace-separated token is the symbol name
    return comment.split(' ', 1)[0]
|
||
|
|
||
|
|
||
|
def main():
    """Compile a PO translation file into the firmware's binary catalog.

    Reads the firmware symbol map and a PO file, matches translations to
    firmware message symbols, and writes a binary catalog consisting of a
    16-byte header, a 16-bit offset table (one slot per symbol id) and a
    NUL-separated string pool.  Repeated translated strings are coalesced
    to a single copy.  Returns 0 on success; exits via fatal() on error.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--warn-unused', action='store_true',
                    help='Warn about unused translations')
    ap.add_argument('--show-coalesced', action='store_true',
                    help='List coalesced translations')
    ap.add_argument('map', help='Firmware symbol map file')
    ap.add_argument('po', help='PO file')
    ap.add_argument('out', help='output')
    args = ap.parse_args()

    # check arguments
    for path in [args.map, args.po]:
        if not os.path.isfile(path):
            # fix: report the file that is actually missing (the original
            # always formatted args.po, even when the map file was at fault)
            fatal("{} does not exist or is not a regular file".format(path))

    # load the map file
    syms = load_map(args.map)
    fw_sig_data = None
    msgid_data = defaultdict(list)   # msgid -> list of symbol entries
    id_msgid = {}                    # symbol id -> msgid
    sym_cnt = 0                      # one past the highest symbol id seen
    for sym in syms:
        if sym['name'] == '_PRI_LANG_SIGNATURE':
            fw_sig_data = sym['data']
        else:
            # redo forward text transformation for transparent matching
            msgid = cs.source_to_unicode(codecs.decode(sym['data'], 'unicode_escape', 'strict'))
            msgid_data[msgid].append(sym)
            id_msgid[sym['id']] = msgid

            # update the max symbol count
            if sym_cnt <= sym['id']:
                sym_cnt = sym['id'] + 1

    if fw_sig_data is None:
        fatal('_PRI_LANG_SIGNATURE not found in map')

    # open translations
    po = polib.pofile(args.po)
    lang_code = po.metadata['Language']
    # the header stores the language as exactly two ASCII bytes
    if not lang_code.isascii() or len(lang_code) != 2:
        fatal(f'unsupported language code {lang_code}')

    # build a catalog of all translations
    trans_table = {}
    for translation in po:
        msgid = translation.msgid
        if msgid in msgid_data:
            trans_table[msgid] = (translation, msgid_data[msgid])
        elif args.warn_unused:
            err = "{}:{}".format(args.po, translation.linenum)
            err += ": unused translation "
            err += translation_ref(translation)
            warn(err)

    for msgid, syms in msgid_data.items():
        if msgid not in trans_table:
            # warn about missing translations
            warn("untranslated text: " + repr(msgid))

    # write the binary catalog
    with open(args.out, "w+b") as fd:
        # data begins after the 16-byte header and the 16-bit offset table
        fixed_offset = 16+2*sym_cnt
        written_locs = {}  # encoded string -> offset of first occurrence

        # compute final data tables
        offsets = b''
        strings = b'\0'  # slot 0 (fixed_offset) is the shared empty string
        for i in range(sym_cnt):
            msgid = id_msgid.get(i)
            translation = trans_table.get(msgid)
            if translation is None or len(translation[0].msgstr) == 0 or translation[0].msgstr == msgid:
                # first slot reserved for untranslated/identical entries
                offsets += struct.pack("<H", fixed_offset)
            else:
                string_bin = cs.unicode_to_source(translation[0].msgstr)

                # check for invalid characters
                invalid_char = cs.translation_check(string_bin)
                if invalid_char is not None:
                    line = translation[0].linenum
                    warn(f'{args.po}:{line} contains unhandled character ' + repr(invalid_char))

                string_bin = string_bin.encode('raw_unicode_escape', 'ignore')
                string_off = written_locs.get(string_bin)
                offset = fixed_offset + len(strings)
                if string_off is not None:
                    # coalesce repeated strings
                    if args.show_coalesced:
                        info(f'coalescing {offset:04x}:{string_off:04x} {string_bin}')
                    offset = string_off
                else:
                    # allocate a new string
                    written_locs[string_bin] = offset
                    strings += string_bin + b'\0'
                offsets += struct.pack("<H", offset)

        # header
        size = 16 + len(offsets) + len(strings)
        header = struct.pack(
            "<IHHHHI",
            FW_MAGIC,
            size,
            sym_cnt,
            0, # no checksum yet
            (ord(lang_code[0]) << 8) + ord(lang_code[1]),
            fw_sig_data)

        fd.write(header)
        fd.write(offsets)
        fd.write(strings)

        # calculate and update the checksum: 16-bit sum with even bytes
        # weighted high, odd bytes low (read the whole image once instead
        # of the original one-byte-at-a-time loop)
        fd.seek(0)
        data = fd.read(size)
        cksum = 0
        for i, byte in enumerate(data):
            cksum += byte << (0 if i % 2 else 8)
            cksum &= 0xffff
        # checksum field lives at offset 8 in the header (<IHH = 8 bytes)
        fd.seek(8)
        fd.write(struct.pack("<H", cksum))

    return 0
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # sys.exit instead of the builtin exit(): the latter is injected by the
    # site module and is not guaranteed to exist (e.g. under python -S)
    sys.exit(main())
|