Prusa-Firmware/lang/lang-extract.py

#!/usr/bin/env python3
import argparse
import bisect
import codecs
import polib
import regex
import sys
import lib.charset as cs

def line_warning(path, line, msg):
    print(f'{path}:{line}: {msg}', file=sys.stderr)

def line_error(path, line, msg):
    print(f'{path}:{line}: {msg}', file=sys.stderr)

def entry_warning_locs(entries):
    for msgid, data in entries:
        print('   text: ' + repr(msgid), file=sys.stderr)
        positions = ', '.join(map(lambda x: x[0] + ':' + str(x[1]), data['occurrences']))
        print('     in: ' + positions, file=sys.stderr)

def entries_warning(entries, msg):
    print('warning: ' + msg, file=sys.stderr)
    entry_warning_locs(entries)

def entry_warning(entry, msg):
    entries_warning([entry], msg)


def newline_positions(source):
    lines = [-1]
    while True:
        idx = source.find('\n', lines[-1] + 1)
        if idx < 0:
            break
        lines.append(idx)
    if lines[-1] != len(source) - 1:
        lines.append(len(source) - 1)
    return lines[1:]

def index_to_line(index, lines):
    return bisect.bisect_left(lines, index) + 1


def extract_file(path, catalog, warn_skipped=False):
    source = open(path).read()
    newlines = newline_positions(source)

    # match internationalized quoted strings
    RE_START = r'\b (_[iI]|ISTR) \s* \('
    RE_META = r'//// \s* ([^\n]*)$'

    RE_I = fr'''
        (?<!(?:/[/*]|^\s*\#) [^\n]*)  # not on a comment or preprocessor
        {RE_START}                    # $1 ref type _i( or ISTR(
        (?:
          \s*
          ("(?:[^"\\]|\\.)*")         # $2 quoted string (chunk)
          (?:\s* {RE_META} )?         # $3 inline metadata
        )+
        \s* \)                        # )
        (?:
          (?:[^\n] (?!{RE_START}))*   # anything except another entry
          {RE_META}                   # $5 final metadata
        )?
    '''

    r_ref_type = 1
    r_quoted_chunk = 2
    r_inline_data = 3
    r_eol_data = 5

    for m in regex.finditer(RE_I, source, regex.M|regex.X):
        # parse the text
        line = index_to_line(m.start(0), newlines)

        text = ""
        for block in m.captures(r_quoted_chunk):
            # remove quotes and unescape
            block = block[1:-1]
            block = codecs.decode(block, 'unicode-escape', 'strict')
            block = cs.source_to_unicode(block)
            text += block

        # check if text is non-empty
        if len(text) == 0:
            line_warning(path, line, 'empty source text, ignored')
            continue

        data = set()
        comments = set()
        for n in [r_inline_data, r_eol_data]:
            meta = m.group(n)
            if meta is not None:
                meta_parts = meta.split('//', 1)
                data.add(meta_parts[0].strip())
                if len(meta_parts) > 1:
                    comments.add(meta_parts[1].strip())

        # check if this message should be ignored
        ignored = False
        for meta in data:
            if regex.search(r'\bIGNORE\b', meta) is not None:
                ignored = True
                break
        if ignored:
            if warn_skipped:
                line_warning(path, line, 'skipping explicitly ignored translation')
            continue

        # extra message catalog name (if any)
        cat_name = set()
        for meta in data:
            sm = regex.search(r'\b(MSG_\w+)', meta)
            if sm is not None:
                cat_name.add(sm.group(1))

        # reference type annotation
        ref_type = 'def' if m.group(r_ref_type) == 'ISTR' else 'ref'
        if ref_type == 'def':
            # ISTR definition: lookup nearby assignment
            lineup_def = source[newlines[line-2]+1:m.end(r_ref_type)]
            sm = regex.search(r'\b PROGMEM_(\S+) \s*=\s* ISTR $', lineup_def, regex.M|regex.X)
            if sm is None:
                line_warning(path, line, 'ISTR not used in an assignment')
            elif sm.group(1) != 'I1':
                line_warning(path, line, 'ISTR not used with PROGMEM_I1')

        # append the translation to the catalog
        pos = [(path, line)]
        entry = catalog.get(text)
        if entry is None:
            catalog[text] = {'occurrences': set(pos),
                             'data': data,
                             'cat_name': cat_name,
                             'comments': comments,
                             'ref_type': set([ref_type])}
        else:
            entry['occurrences'] = entry['occurrences'].union(pos)
            entry['data'] = entry['data'].union(data)
            entry['cat_name'] = entry['cat_name'].union(cat_name)
            entry['comments'] = entry['comments'].union(comments)
            entry['ref_type'].add(ref_type)


def extract_refs(path, catalog):
    source = open(path).read()
    newlines = newline_positions(source)

    # match message catalog references to add backrefs
    RE_CAT = r'''
        (?<!(?:/[/*]|^\s*\#) [^\n]*)          # not on a comment or preprocessor
        \b (?:_[TOR]) \s* \( \s* (\w+) \s* \) # $1 catalog name
    '''

    for m in regex.finditer(RE_CAT, source, regex.M|regex.X):
        line = index_to_line(m.start(0), newlines)
        pos = [(path, line)]
        cat_name = m.group(1)
        found = False
        defined = False
        for entry in catalog.values():
            if cat_name in entry['cat_name']:
                entry['occurrences'] = entry['occurrences'].union(pos)
                entry['ref_type'].add('ref')
                found = True
                if 'def' in entry['ref_type']:
                    defined = True
        if not found:
            line_error(path, line, f'{cat_name} not found')
        elif not defined:
            line_error(path, line, f'{cat_name} referenced but never defined')


def check_entries(catalog, warn_missing, warn_same_line):
    cat_entries = {}

    for entry in catalog.items():
        msgid, data = entry

        # ensure we have at least one name
        if len(data['cat_name']) == 0 and warn_missing:
            entry_warning(entry, 'missing MSG identifier')

        # ensure references are being defined
        if data['ref_type'] == set(['def']):
            if len(data['cat_name']) == 0:
                if warn_missing:
                    entry_warning(entry, 'entry defined, but never used')
            else:
                id_name = next(iter(data['cat_name']))
                entry_warning(entry, f'{id_name} defined, but never used')

        # check custom characters
        invalid_char = cs.source_check(msgid)
        if invalid_char is not None:
            entry_warning(entry, 'source contains unhandled custom character ' + repr(invalid_char))

        tokens = []
        for meta in data['data']:
            tokens.extend(regex.split(r'\s+', meta))
        seen_keys = set()
        for token in tokens:
            if len(token) == 0:
                continue

            # check metadata syntax
            if regex.match(r'[cr]=\d+$', token) is None and \
               regex.match(r'MSG_[A-Z_0-9]+$', token) is None:
                entry_warning(entry, 'bogus annotation: ' + repr(token))

            # check for repeated keys
            key = regex.match(r'([^=])+=', token)
            if key is not None:
                key_name = key.group(1)
                if key_name in seen_keys:
                    entry_warning(entry, 'repeated annotation: ' + repr(token))
                else:
                    seen_keys.add(key_name)

            # build the inverse catalog map
            if token.startswith('MSG_'):
                if token not in cat_entries:
                    cat_entries[token] = [entry]
                else:
                    cat_entries[token].append(entry)

    # ensure the same id is not used in multiple entries
    for cat_name, entries in cat_entries.items():
        if len(entries) > 1:
            entries_warning(entries, f'{cat_name} used in multiple translations')

    if warn_same_line:
        # build the inverse location map
        entry_locs = {}
        for entry in catalog.items():
            msgid, data = entry
            for loc in data['occurrences']:
                if loc not in entry_locs:
                    entry_locs[loc] = [loc]
                else:
                    entry_locs[loc].append(loc)

        # check for multiple translations on the same location
        for loc, entries in entry_locs.items():
            if len(entries) > 1:
                line_warning(loc[0], loc[1], f'line contains multiple translations')


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('-o', dest='pot', required=True, help='PO template output file')
    ap.add_argument('--no-missing', action='store_true',
                    help='Do not warn about missing MSG entries')
    ap.add_argument('--warn-same-line', action='store_true',
                    help='Warn about multiple translations on the same line')
    ap.add_argument('--warn-skipped', action='store_true',
                    help='Warn about explicitly ignored translations')
    ap.add_argument('-s', '--sort', action='store_true',
                    help='Sort output catalog')
    ap.add_argument('file', nargs='+', help='Input files')
    args = ap.parse_args()

    # extract strings
    catalog = {}
    for path in args.file:
        extract_file(path, catalog, warn_skipped=args.warn_skipped)

    # process backreferences in a 2nd pass
    for path in args.file:
        extract_refs(path, catalog)

    # check the catalog entries
    check_entries(catalog, warn_missing=not args.no_missing, warn_same_line=args.warn_same_line)

    # write the output PO template
    po = polib.POFile()
    po.metadata = {
        'Language': 'en',
        'MIME-Version': '1.0',
        'Content-Type': 'text/plain; charset=utf-8',
        'Content-Transfer-Encoding': '8bit'}

    messages = catalog.keys()
    if args.sort:
        messages = sorted(messages)
    for msgid in messages:
        data = catalog[msgid]
        comment = ', '.join(data['data'])
        if len(data['comments']):
            comment += '\n' + '\n'.join(data['comments'])
        occurrences = data['occurrences']
        if args.sort:
            occurrences = list(sorted(occurrences))
        po.append(
            polib.POEntry(
                msgid=msgid,
                comment=comment,
                occurrences=occurrences))

    po.save(args.pot)
    return 0

if __name__ == '__main__':
    exit(main())