#!/bin/python3
"""
This file is not used while TeX is running. It's for generating unicode-math-input-table.tex file only.

This requires pythonimmediate (not sure which version is compatible but commit
020068db8a966c138b5b0b93695c0fefdef03d0a on Python 3.11.3 is)

To generate: run::

    python3 unicode-math-input-script.py > unicode-math-input-table.tex

How does it work?

The mapping is determined from multiple sources:

* The unicode-math package itself, which defines a "command → Unicode character" mapping.
  This does not always work because different TeX packages may name the command differently.

* Synonym table, obtained by looking at STIX's command definition

* TeX's glyph → unicode mapping (used to facilitate copy-paste in PDF), e.g.
  /usr/share/texmf-dist/tex/generic/pdftex/glyphtounicode.tex

    This should be good, but is currently not used.
    Furthermore, not all TeX commands are implemented by getting a single character from a font...

    How does the Unicode mapping work? First there's the `pdftex.map` file, then there's umsa.cmap
    for msam10.tfm/afm/pfm/pfb/mf (metafont source file)

    /usr/share/texmf-dist/fonts/source/public/amsfonts/symbols/msam10.mf
    /usr/share/texmf-dist/fonts/source/public/amsfonts/symbols/asymbols.mf
    /usr/share/texmf-dist/fonts/afm/public/amsfonts/symbols/msam10.afm  → plaintext-looking file may work
    /usr/share/texmf-dist/fonts/tfm/public/amsfonts/symbols/msam10.tfm
    /usr/share/texmf-dist/fonts/type1/public/amsfonts/symbols/msam10.pfm
    /usr/share/texmf-dist/fonts/type1/public/amsfonts/symbols/msam10.pfb

    The glyphtounicode.tex may be a bit problematic...
    https://tex.stackexchange.com/questions/66300/how-to-fix-missing-or-incorrect-mappings-from-glyphtounicode-tex

See also: section 3.2 How to find a table of correspondences?
in https://tex.stackexchange.com/a/628285/250119
"""

from __future__ import annotations
from pythonimmediate.engine import ChildProcessEngine
from pythonimmediate.engine import default_engine
from pythonimmediate import*
import pythonimmediate
from collections import defaultdict, Counter
from itertools import groupby
import os
import json
import subprocess
import re
import sys
import unicodedata
import functools
from dataclasses import dataclass
# Imported explicitly (after the star import) so that the script does not
# depend on pythonimmediate happening to re-export these two names.
from pathlib import Path
from typing import Optional

# ======== header of the generated file

print(r'% This file is automatically generated from unicode-math-input-script.py.')

# ======== start a luatex engine

# https://tex.stackexchange.com/questions/574607/tex-hashtokens-incomplete
default_engine.set_engine(ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"}))

# From the TeXbook:
#
#   (INITEX starts out with \mathcode x = x for all characters x that are
#   neither letters nor digits. The ten digits have \mathcode x = x+"7000;
#   the 52 letters have \mathcode x = x+"7100.)
#
# (Kept as a comment: as a bare non-raw string literal the "\m" above is an
# invalid escape sequence.)

# ======== reset all mathcode to 0
# so that every mathcode that is nonzero afterwards was set while loading unicode-math
TokenList([r"\directlua", TokenList.fstr(
r"""
for i=0, 0x10ffff do
    tex.setmathcode(i, {0, 0, 0})
end
"""
)]).execute()

# ======== load unicode-math
execute(r'''
\documentclass{article}
\usepackage{unicode-math}
\begin{document}
''')

# ======== print changed mathcodes (we aim to support all of these)
# One output line per character whose mathcode is no longer (0, 0, 0),
# in the form "<codepoint>:<char>:<class> <family> <position>".
# The Lua "--" comment below relies on the line breaks inside this string.
changed_mathcodes = TokenList([r"\directlua", TokenList.fstr(
r"""
for i=0, 0x10ffff do
    local cls, family, pos=table.unpack(tex.getmathcode(i))
    if not (
        --(cls==0 and family==0 and pos==i) or (cls==7 and family==1 and pos==i)
        (cls==0 and family==0 and pos==0)
    ) then
        tex.print(-2, i .. ":"..utf8.char(i)..":" .. cls..' '..family..' '..pos .. "\n")
    end
end
"""
)]).expand_x().str()

changed_chars: set[str] = set()
for line in changed_mathcodes.splitlines():
    match = re.fullmatch(r'(\d+):(.):(\d+) (\d+) (\d+)', line)
    assert match
    unicode_char = match[2]
    assert match[2]==chr(int(match[1])), match
    # `code` is not used further in this script; building it presumably serves
    # as a sanity check that the reported class number is a known MathClass.
    if match[3]=="8" and match[4]=="0" and match[5]=="0":
        code = Umathcode.active
    else:
        code = Umathcode(int(match[4]), MathClass.lookup(int(match[3])), int(match[5]))
    changed_chars.add(unicode_char)

# ======== parse the unicode math table
# check=True: fail right here if kpsewhich cannot locate the file, instead of
# continuing with an empty path.
path = subprocess.run(
    ["kpsewhich", "unicode-math-table.tex"],
    stdout=subprocess.PIPE,
    check=True,
).stdout
lines = Path(path.decode('u8').strip('\n')).read_text(encoding="utf-8").splitlines()
lines = [line for line in lines if line and not line.startswith("%")]
unicode_math_table_=defaultdict(list)
for line in lines:
    match = re.fullmatch(r'\\UnicodeMathSymbol{"(.*)}{\\(.*?) *}{\\math(.*)}{(.*)}%', line)
    assert match
    unicode_char=chr(int(match[1], 16))
    csname=match[2]
    unicode_math_table_[unicode_char].append(csname)

# Unicode character → tuple of control sequence names (without the backslash)
unicode_math_table={unicode_char: tuple(csnames) for unicode_char, csnames in unicode_math_table_.items()}

# ======== extract unicode-math synonyms

def control_sequences()->list[str]:
    """Return the purely alphabetic names found in ``tex.hashtokens()`` of the current default engine.

    The names are deduplicated on the Lua side and transferred as one
    NUL-separated string. If the Lua evaluation yields nothing, the result is
    ``[""]`` (splitting an empty string).
    """
    return (lua_try_eval(r"""
    do
        local s={}
        for k, v in pairs(tex.hashtokens()) do
            if v:find("^[A-Za-z]+$") then
                s[v]=0
            end
        end
        local t={}
        for v, _ in pairs(s) do
            table.insert(t, v)
        end
        return table.concat(t, "\x00")
    end
    """) or "").split("\x00")

# Each inner list is a group of control sequence names considered synonyms.
extra_synonyms_list: list[list[str]] = [
    ["adots", "iddots"],
    ["unicodecdots", "cdots"],  # https://github.com/wspr/unicode-math/issues/571
    ["unicodeellipsis", "ldots"],
    #["llbracket", "lBrack"],
    #["rrbracket", "rBrack"],
]

c=control_sequences()
m={x: T[x].meaning_str() for x in c}

# a protected macro whose whole replacement text is a single other control sequence
pattern=re.compile(r'\\protected macro:->\\([A-Za-z]+) ?')
extra_synonyms_list += [
    [cs_name, match[1]]
    for cs_name, cs_meaning in m.items()
    if (match:=pattern.fullmatch(cs_meaning))
]

def same_meaning_control_sequences(meaning: dict[str, str])->list[list[str]]:
    """Group the control sequences in *meaning* that share the same meaning string.

    :param meaning: control sequence name → its ``\\meaning`` string.
    :return: one list per meaning that is not ``"undefined"`` and is shared by
        at least two names.

    The names are taken from *meaning* itself instead of the module-level
    list ``c``, so the result no longer depends on ``c`` being in sync with
    the argument.
    """
    by_meaning = sorted(meaning, key=meaning.__getitem__)  # groupby needs sorted input
    groups = []
    for meaning_text, names in groupby(by_meaning, meaning.__getitem__):
        if meaning_text=="undefined":
            continue
        group = [*names]
        if len(group)>=2:
            groups.append(group)
    return groups

extra_synonyms_list += same_meaning_control_sequences(m)

# ======== extract amsmath&stix synonyms
# Each preamble is loaded in a fresh LuaTeX engine. Note that `c` and `m`
# are rebound at module level on every iteration.
m_values=[]
for preamble in [
r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\begin{document}
""",
r"""
\documentclass{article}
\usepackage{stix}
\begin{document}
"""
]:
    with ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"}) as e, default_engine.set_engine(e):
        execute(preamble)
        c=control_sequences()
        m={x: T[x].meaning_str() for x in c}
        extra_synonyms_list += same_meaning_control_sequences(m)
        m_values.append(m)
[amsmath_meaning, stix_meaning]=m_values

# ======== build extra_synonyms table
# Repeatedly merge all groups that contain the most frequent name, until
# every name occurs exactly once (i.e. the groups are disjoint).
while True:
    tmp=Counter([x for l in extra_synonyms_list for x in l])
    [(item, frequency)]=tmp.most_common(1)
    if frequency==1: break
    assert frequency>1
    extra_synonyms_list=[
        # the group that contain item
        [*{x for l in extra_synonyms_list if item in l for x in l}]
    ] + [
        # remaining groups
        l for l in extra_synonyms_list if item not in l
    ]

extra_synonyms_list=sorted([
    sorted(l)
    for l in {
        frozenset(
            item for item in l
            if item not in ("dotsc", "dotsm", "dotsb", "dots")  # some simple filtering -- we will just use \cdots and \ldots
        )
        for l in extra_synonyms_list
    }  # deduplicate
    if len(l)>1
])
tmp=Counter(sum(extra_synonyms_list, []))
assert tmp.most_common()[0][1]==1, tmp

# name → the (sorted) synonym group that contains it
extra_synonyms = {v: u for u in extra_synonyms_list for v in u}

# ======== check how much of the table is valid on unicode-math/luatex

def getdelcode(x: str)->tuple[int, int, int, int]:
    """Return the values of ``tex.getdelcode`` for the character *x* in the default engine, as a tuple of ints."""
    return tuple(map(int, TokenList([r"\directlua", TokenList.fstr(  # type: ignore
    r"""
    for _, v in ipairs(tex.getdelcode(""" + str(ord(x)) + r""")) do
        tex.sprint(v..",")
    end
    """
    )]).expand_x().str().rstrip(",").split(",")))

@functools.lru_cache(maxsize=None)
def meaning(csname: str)->str:
    """Return the (cached) meaning string of control sequence *csname* in the default engine."""
    return T[csname].meaning_str()

@functools.lru_cache(maxsize=None)
def good_delimiter(meaning: str, ch: str)->bool:
    r"""Return whether *meaning* is the ``\Udelimiter`` definition expected for the character *ch*.

    Requires the mathcode of *ch* to be family 0 at its own code point, its
    delcode to be ``(0, ord(ch), 0, 0)``, and *meaning* to equal the
    corresponding ``\protected macro:->\Udelimiter ...`` string.
    """
    mathcode = umathcode[ch]
    codepoint = ord(ch)
    if mathcode.family!=0 or mathcode.position!=codepoint: return False
    if getdelcode(ch)!=(0, codepoint, 0, 0): return False
    expected = f'\\protected macro:->\\Udelimiter {mathcode.cls.value}\\symoperators "{codepoint:05X}\\scan_stop: '
    return meaning==expected

# Characters that the .sty file handles by itself; the file contains
# non-ASCII characters, so it is read explicitly as UTF-8.
specially_handled = {
    match[1]
    for match in re.finditer(
        r'\\__umi_special_handle{(.)}',
        Path("unicode-math-input.sty").read_text(encoding="utf-8"),
    )
}
not_handled = {*"⎴⎵⏜⏝⏞⏟⟌\u03a2\U0001d455"}

# unicode-math alphabet prefix → name of the wrapping command (None: no wrapping)
math_alphabet_translate = {
    "mup"      : None,
    "mbf"      : "umiMathbf",
    "mit"      : "umiMathit",
    "mbfit"    : "umiMathbfit",  # https://tex.stackexchange.com/questions/14395/bold-italic-vectors
    "mscr"     : "umiMathscr",
    "mbfscr"   : "umiMathbfscr",  # https://tex.stackexchange.com/questions/23455/latex-calligraphic-script-bold
    "mfrak"    : "umiMathfrak",
    "Bbb"      : "umiMathbb",
    "mitBbb"   : "umiMathbbit",  # https://tex.stackexchange.com/questions/16645/blackboard-italic-font
    "mbffrak"  : "umiMathbffrak",  # https://tex.stackexchange.com/questions/610696/may-i-have-bold-mathfraktur
    "msans"    : "umiMathsf",
    "mbfsans"  : "umiMathsfbf",  # https://tex.stackexchange.com/questions/340097/bold-sans-serif-math-font
    "mitsans"  : "umiMathsfit",
    "mbfitsans": "umiMathsfbfit",
    "mtt"      : "umiMathtt",
}
math_alphabet_translate = dict(sorted(math_alphabet_translate.items(), key=lambda x: -len(x[0])))  # match against longest prefix first

# what remains of a csname after stripping the alphabet prefix → base symbol to typeset
math_alphabet_csname_translation = {
    "alpha": r"\alpha", "Alpha": r"\Alpha",
    "beta": r"\beta", "Beta": r"\Beta",
    "chi": r"\chi", "Chi": r"\Chi",
    "delta": r"\delta", "Delta": r"\Delta",
    "digamma": r"\digamma", "Digamma": r"\Digamma",
    "epsilon": r"\epsilon", "Epsilon": r"\Epsilon",
    "eta": r"\eta", "Eta": r"\Eta",
    "gamma": r"\gamma", "Gamma": r"\Gamma",
    "iota": r"\iota", "Iota": r"\Iota",
    "kappa": r"\kappa", "Kappa": r"\Kappa",
    "lambda": r"\lambda", "Lambda": r"\Lambda",
    "mu": r"\mu", "Mu": r"\Mu",
    "nabla": r"\nabla",
    "nu": r"\nu", "Nu": r"\Nu",
    "omega": r"\omega", "Omega": r"\Omega",
    "omicron": r"\omicron", "Omicron": r"\Omicron",
    "partial": r"\partial",
    "phi": r"\phi", "Phi": r"\Phi",
    "pi": r"\pi", "Pi": r"\Pi",
    "psi": r"\psi", "Psi": r"\Psi",
    "rho": r"\rho", "Rho": r"\Rho",
    "sigma": r"\sigma", "Sigma": r"\Sigma",
    "sum": r"\sum",
    "tau": r"\tau", "Tau": r"\Tau",
    "theta": r"\theta", "Theta": r"\Theta",
    "upsilon": r"\upsilon", "Upsilon": r"\Upsilon",
    "varepsilon": r"\varepsilon",
    "varkappa": r"\varkappa",
    "varphi": r"\varphi",
    "varpi": r"\varpi",
    "varrho": r"\varrho",
    "varsigma": r"\varsigma",
    "vartheta": r"\vartheta",
    "varTheta": r"\varTheta",
    "xi": r"\xi", "Xi": r"\Xi",
    "zeta": r"\zeta", "Zeta": r"\Zeta",
    "a": "a", "A": "A",
    "b": "b", "B": "B",
    "c": "c", "C": "C",
    "d": "d", "D": "D",
    "e": "e", "E": "E",
    "f": "f", "F": "F",
    "g": "g", "G": "G",
    "h": "h", "H": "H",
    "i": "i", "I": "I",
    "j": "j", "J": "J",
    "k": "k", "K": "K",
    "l": "l", "L": "L",
    "m": "m", "M": "M",
    "n": "n", "N": "N",
    "o": "o", "O": "O",
    "p": "p", "P": "P",
    "q": "q", "Q": "Q",
    "r": "r", "R": "R",
    "s": "s", "S": "S",
    "t": "t", "T": "T",
    "u": "u", "U": "U",
    "v": "v", "V": "V",
    "w": "w", "W": "W",
    "x": "x", "X": "X",
    "y": "y", "Y": "Y",
    "z": "z", "Z": "Z",
    "zero" : "0",
    "one"  : "1",
    "two"  : "2",
    "three": "3",
    "four" : "4",
    "five" : "5",
    "six"  : "6",
    "seven": "7",
    "eight": "8",
    "nine" : "9",
}

# Greek commands paired with a Latin letter that is emitted as an alternative
math_alphabet_redundant_greek = {
    r"\Alpha"  : "A",
    r"\Beta"   : "B",
    r"\Chi"    : "X",
    r"\Digamma": "F",
    r"\Epsilon": "E",
    r"\Eta"    : "H",
    r"\Iota"   : "I",
    r"\Kappa"  : "K",
    r"\Mu"     : "M",
    r"\Nu"     : "N",
    r"\omicron": "o",
    r"\Omicron": "O",
    r"\Rho"    : "P",
    r"\Tau"    : "T",
    r"\Zeta"   : "Z",
}

# csname → plain ASCII character emitted as an extra alternative
ASCII_symbol_synonym = {
    "minus": "-",
    "mid": "|",
}

## ======== fullwidth forms of the printable ASCII characters

remaining_chars = changed_chars - {*unicode_math_table} - specially_handled - not_handled
remaining_chars = {x for x in remaining_chars if ord(x) >= 0x80}

for i in range(ord("!"), ord("~")+1):
    fullch=chr(0xff00+i-0x20)
    assert unicodedata.name(fullch) == "FULLWIDTH " + unicodedata.name(chr(i))
    remaining_chars.discard(fullch)
    # NOTE(review): the original indentation is lost in this copy of the file;
    # the print is assumed to be unconditional (one definition per fullwidth
    # character) -- confirm against the upstream script.
    print(r'\__umi_define_char{' + fullch + r'}{\char'+str(i)+' }')

# every control sequence name known from unicode-math's table, stix or amsmath
defined_csnames = {x for l in unicode_math_table.values() for x in l} | {*stix_meaning} | {*amsmath_meaning}

# a pdfTeX engine with the classic AMS packages, used for cross-checking below
pdf_engine=ChildProcessEngine("pdftex")
with default_engine.set_engine(pdf_engine):
    execute(r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{mathrsfs}
\begin{document}
""")

def remove_not(a: str)->Optional[str]:
    r"""Return the un-negated form of the command *a*, or None if there is none.

    *a* includes the leading backslash. If stripping a ``\not`` prefix (tried
    first) or a ``\n`` prefix leaves a name in ``defined_csnames``, that name
    is returned with a backslash prepended. ``\ni`` and ``\nu`` are explicitly
    excluded and yield None.
    """
    if a in (r"\ni", r"\nu"): return None
    for negation_prefix in (r"\not", r"\n"):
        if a.startswith(negation_prefix) and a.removeprefix(negation_prefix) in defined_csnames:
            return '\\' + a.removeprefix(negation_prefix)
    return None

# ======== main loop: one \__umi_define_char... line per supported character

for unicode_char, csnames_ in unicode_math_table.items():
    csnames = [*csnames_]

    if unicodedata.combining(unicode_char) != 0:
        if 0:
            # disabled debug dump (note that `good` is not defined at this point)
            print(
                repr(unicode_char),
                f"U+{ord(unicode_char):04X}",
                unicodedata.name(unicode_char),
                "mathcode: ", umathcode[unicode_char],
                {csname: meaning(csname) for csname in csnames},
                f" -- good: {good}" if good else ""
            )
        for csname in csnames:
            assert "Umathaccent" in meaning(csname), (unicode_char, unicodedata.name(unicode_char), csname, meaning(csname))
        continue  # don't support combining characters

    # Characters named "COMBINING ..." whose combining class is nevertheless 0;
    # only the four enclosing marks below are expected here. They are written
    # with a space in front of them in the generated file.
    is_combining2="COMBINING" in unicodedata.name(unicode_char).split()
    if is_combining2:
        for csname in csnames:
            assert csname in "enclosecircle enclosesquare enclosediamond enclosetriangle".split(), (unicode_char, csname)
        assert len(csnames)==1
    optional_space=" " if is_combining2 else ""

    if ord(unicode_char) <= 0x7f: continue
    if unicode_char in specially_handled or unicode_char in not_handled: continue

    #
    bad_or_delimiter = [csname for csname in csnames if meaning(csname) != "the character " + unicode_char]
    delimiter = [csname for csname in bad_or_delimiter if good_delimiter(meaning(csname), unicode_char)]
    bad = [*{*bad_or_delimiter} - {*delimiter}]
    is_delimiter = bool(delimiter) or getdelcode(unicode_char)!=(-1, 0, 0, 0)

    # in unicode-math:
    # the situation with ⟨/langle and ↑/uparrow is different
    # in both cases the character gets assigned mathcode and delcode so \left⟨ and \left↑ both work
    # in langle case the macro is defined to be \protected macro:->\Udelimiter 4\symoperators "027E8\scan_stop:
    # this is because of @@_set_math_open logic which sets the macro like that
    # (I don't know why it doesn't just do the thing below)
    # in uparrow case the macro is defined to be "the character ↑"
    # then delcode is assigned in @@_assign_delcode manually

    if bad:
        good = [*set(csnames) - set(bad)]
        print(
            repr(unicode_char),
            f"U+{ord(unicode_char):04X}",
            unicodedata.name(unicode_char),
            "mathcode: ", umathcode[unicode_char],
            "bad: ", {csname: meaning(csname) for csname in bad},
            f" -- good: {good}" if good else ""
        )
        assert False, "please specially handle this"

    else:
        # add the synonyms, then deduplicate while keeping the order
        csnames = [*csnames_]
        for csname in [*csnames]:
            if csname in extra_synonyms:
                csnames+=extra_synonyms[csname]
        csnames=[*{csname: None for csname in csnames}]

        # items1: the TeX alternatives to emit for this character
        items1=[]
        for csname in csnames:
            if not is_delimiter:
                with default_engine.set_engine(pdf_engine):
                    assert "delimiter" not in T[csname].meaning_str(), (unicode_char, csname)
                    # that is the symbol is not a delimiter in pdf_engine either (check is not particularly reliable but okay)

            for prefix, replacement in math_alphabet_translate.items():
                if csname.startswith(prefix):
                    assert csname not in ASCII_symbol_synonym
                    cs = math_alphabet_csname_translation[csname.removeprefix(prefix)]

                    def wrap_in_alphabet_selector(cs: str)->str:
                        """Wrap *cs* in the alphabet command of the current prefix (unchanged if there is none)."""
                        if replacement is None: return cs
                        return "\\" + replacement + "{" + cs + "}"

                    if cs in math_alphabet_redundant_greek:
                        items1.append(wrap_in_alphabet_selector(
                            "\\__umi_alternatives_iisafe" + cs +
                            ("" if math_alphabet_redundant_greek[cs].startswith("\\") else " ") +
                            math_alphabet_redundant_greek[cs]
                        ))
                    else:
                        items1.append(wrap_in_alphabet_selector(cs))
                    break
            else:
                # no alphabet prefix matched: use the command itself
                items1.append("\\" + csname)
                if csname in ASCII_symbol_synonym:
                    # append (not +=): += on a list would add the string character by character
                    items1.append(ASCII_symbol_synonym[csname])

        assert items1
        if is_delimiter and len(items1)>1:
            print("Warning: Synonym for delimiter not supported?", unicode_char, delimiter, items1, file=sys.stderr)
            del items1[1:]

        if len(items1)==1:
            a = items1[0]
            b = remove_not(a)
            if b is not None:
                assert not is_delimiter
                # "\\__umi..." instead of the original "\__umi...": same output, but no invalid escape sequence
                print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_not{a}{b}}}")
            else:
                if is_delimiter:
                    print(f"\\__umi_define_char_maybe_delimiter{{{optional_space}{unicode_char}}}{{{a}}}")
                else:
                    print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{{a}}}")

        elif len(items1)==2:
            assert re.fullmatch(r'\\[a-zA-Z]+', items1[0]), items1
            assert re.fullmatch(r'\\[a-zA-Z]+|[^a-zA-Z]', items1[1]), items1
            b=remove_not(items1[0])
            if b is not None:
                d=remove_not(items1[1])
                assert d is not None, items1
                print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_not_two{items1[0]}{items1[1]}{b}{d}}}")
            else:
                print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives{items1[0]}{items1[1]}}}")

        else:
            assert len(items1)>=3, items1
            assert all(remove_not(x) is None for x in items1), items1
            assert all(re.fullmatch(r'\\[a-zA-Z]+', c) for c in items1), items1
            print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_m{{{''.join(items1)}}}}}")

##

# ========

sys.exit()

# ======== part below are draft.
# Draft / scratch code. It comes after the sys.exit() call above, so it is
# never reached when the file is run as a script. Several statements are bare
# expressions with no effect in a script -- presumably left over from
# interactive inspection.

default_engine.set_engine(ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"}, autorestart=True))

# Three alternative preambles -- presumably meant to be tried one at a time.
execute(r'\documentclass{article}\usepackage{unicode-math}\begin{document}')
execute(r'\documentclass{article}\usepackage{amsmath,amssymb,amsfonts}\begin{document}')
execute(r'\documentclass{article}\usepackage{amsmath}\usepackage{amssymb}\usepackage{amsfonts}\usepackage{tikz}')

@functools.lru_cache(maxsize=None)
def is_defined(csname: str)->bool:
    """Return whether the meaning of *csname* in the default engine differs from "undefined" (cached)."""
    return T[csname].meaning_str()!="undefined"

# show distinct items with math alphabet
# a_: csname with its (longest matching) alphabet prefix removed → list of the full csnames
a_=defaultdict(list)
for l in unicode_math_table.values():
    for csname in l:
        if csname.startswith(tuple(math_alphabet_translate)):
            t = csname
            for prefix in sorted(math_alphabet_translate, key=len, reverse=True):
                if t.startswith(prefix):
                    t=t.removeprefix(prefix)
                    break
            a_[t].append(csname)
a_

a_.keys()

# print the base commands of the translation table that are not defined in the current engine
for v in math_alphabet_csname_translation.values():
    if v.startswith("\\") and not is_defined(v[1:]):
        print(v)

def is_okay(csname: str)->bool:
    """Return True if *csname* is defined, or starts with one of the math alphabet prefixes."""
    if is_defined(csname): return True
    if csname.startswith(tuple(math_alphabet_translate)): return True
    return False

#

# print bad ones
# (non-ASCII characters for which none of the csnames is okay)
for unicode_char, csnames_ in unicode_math_table.items():
    if ord(unicode_char) >= 0x80 and all(
        not is_okay(csname)
        for csname in csnames_
    ):
        print(unicode_char, csnames_)

# print okay ones
# (non-ASCII characters with at least one defined csname)
for unicode_char, csnames_ in unicode_math_table.items():
    valid_csnames = [
        csname
        for csname in csnames_
        if T[csname].meaning_str()!="undefined"
    ]
    if ord(unicode_char) >= 0x80 and valid_csnames:
        print(unicode_char, valid_csnames)

T.lsime.meaning_str()

"ℝ".encode('u8')

# Look at the meaning of the active character "\xe2" and of some UTF-8 related
# macros in pdf_engine, then at the same active character in the default engine.
BalancedTokenList([T.meaning, Catcode.active("\xe2")]).expand_x(engine=pdf_engine)
T["UTFviii@three@octets"].meaning_str(engine=pdf_engine)
T["UTFviii@three@octets@combine"].meaning_str(engine=pdf_engine)
T["UTF@three@octets@noexpand"].meaning_str(engine=pdf_engine)

BalancedTokenList([T.meaning, Catcode.active("\xe2")]).expand_x()

# \csname experiment in a separate pdfTeX engine
test_engine=ChildProcessEngine("pdftex")
BalancedTokenList(r"\def\aa{bb}").execute(engine=test_engine)
BalancedTokenList(r"\csname\noexpand\aa\endcsname").expand_o(engine=test_engine)  # give error
BalancedTokenList(r"\csname\string\aa\endcsname").expand_o(engine=test_engine)  # \[\aa] as expected