#!/bin/python3
"""
This file is not used while TeX is running; it is only used to generate the unicode-math-input-table.tex file.
This requires pythonimmediate (it is not known exactly which versions are compatible, but
commit 020068db8a966c138b5b0b93695c0fefdef03d0a on Python 3.11.3 works).

To generate: run::
	python3 unicode-math-input-script.py > unicode-math-input-table.tex

How does it work?

The mapping is determined from multiple sources:

* The unicode-math package itself, which defines a "command → Unicode character" mapping.
  This does not always work because different TeX packages may name the command differently.

* Synonym table, obtained by looking at STIX's command definition

* TeX's glyph → unicode mapping (used to facilitate copy-paste in PDF),
  e.g. /usr/share/texmf-dist/tex/generic/pdftex/glyphtounicode.tex
  This should be good, but is currently not used. Furthermore, not all TeX commands are implemented by
  getting a single character from a font...

How does the Unicode mapping work?

First there's the `pdftex.map` file, then there's umsa.cmap for msam10.tfm/afm/pfm/pfb/mf (metafont source file)

/usr/share/texmf-dist/fonts/source/public/amsfonts/symbols/msam10.mf
	/usr/share/texmf-dist/fonts/source/public/amsfonts/symbols/asymbols.mf

/usr/share/texmf-dist/fonts/afm/public/amsfonts/symbols/msam10.afm
→ plaintext-looking file may work

/usr/share/texmf-dist/fonts/tfm/public/amsfonts/symbols/msam10.tfm
/usr/share/texmf-dist/fonts/type1/public/amsfonts/symbols/msam10.pfm
/usr/share/texmf-dist/fonts/type1/public/amsfonts/symbols/msam10.pfb

The glyphtounicode.tex may be a bit problematic...
	https://tex.stackexchange.com/questions/66300/how-to-fix-missing-or-incorrect-mappings-from-glyphtounicode-tex

See also: section 3.2 How to find a table of correspondences? in https://tex.stackexchange.com/a/628285/250119

"""

from __future__ import annotations

from pythonimmediate.engine import ChildProcessEngine
from pythonimmediate.engine import default_engine
from pythonimmediate import*
import pythonimmediate
from collections import defaultdict, Counter
from itertools import groupby
import os
import json
import subprocess
import re
import sys
import unicodedata
import functools
from dataclasses import dataclass

# ========

# First line written to stdout: a TeX comment marking the output as generated
# (the module docstring says stdout is redirected into the table file).
print(r'% This file is automatically generated from unicode-math-input-script.py.')


# ======== start a luatex engine
# https://tex.stackexchange.com/questions/574607/tex-hashtokens-incomplete
# The child process inherits the current environment plus hash_extra=0
# (see the link above for the tex.hashtokens() issue this relates to).
default_engine.set_engine(ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"}))


"""
from the TeXbook: 

(INITEX starts out with
\mathcode x = x for all characters x that are neither letters nor digits. The ten digits
have \mathcode x = x+"7000; the 52 letters have \mathcode x = x+"7100.)
"""
# ======== reset all mathcode to 0
# Every code point gets mathcode {0, 0, 0} before any package is loaded, so a
# later scan can tell which characters had their mathcode assigned afterwards.
TokenList([r"\directlua", TokenList.fstr(
r"""
for i=0, 0x10ffff do
	tex.setmathcode(i, {0, 0, 0})
end
"""
)]).execute()

# ======== load unicode-math
# Executed in the default (luatex) engine set above.
execute(r'''
\documentclass{article}
\usepackage{unicode-math}
\begin{document}
''')

# ======== print changed mathcodes (we aim to support all of these)
# The Lua loop walks every code point and prints one line per character whose
# mathcode is no longer {0, 0, 0}, in the form
#   <codepoint>:<character>:<class> <family> <position>
# The expansion result is captured as a single Python string.
changed_mathcodes = TokenList([r"\directlua", TokenList.fstr(
r"""
for i=0, 0x10ffff do
	local cls, family, pos=table.unpack(tex.getmathcode(i))
	if not (
		--(cls==0 and family==0 and pos==i) or (cls==7 and family==1 and pos==i) 
		(cls==0 and family==0 and pos==0)
	) then
		tex.print(-2, i .. ":"..utf8.char(i)..":" .. cls..' '..family..' '..pos .. "\n")
	end
end
"""
)]).expand_x().str()

# Characters whose mathcode was changed after the reset, parsed from the
# "<codepoint>:<character>:<class> <family> <position>" lines captured above.
changed_chars: set[str] = set()
_mathcode_line = re.compile(r'(\d+):(.):(\d+) (\d+) (\d+)')
for line in changed_mathcodes.splitlines():
	match = _mathcode_line.fullmatch(line)
	assert match
	codepoint, unicode_char, cls, family, position = match.groups()
	# the printed character must agree with the printed code point
	assert unicode_char==chr(int(codepoint)), match
	# NOTE(review): `code` is not read again in this file; the Umathcode object
	# is still constructed exactly as before.
	if (cls, family, position)==("8", "0", "0"):
		code = Umathcode.active
	else:
		code = Umathcode(int(family), MathClass.lookup(int(cls)), int(position))
	changed_chars.add(unicode_char)

# ======== parse the unicode math table

# Ask kpsewhich where unicode-math-table.tex lives.  check=True makes a failed
# lookup raise CalledProcessError right here instead of continuing with an
# empty path and failing later with an unrelated-looking file error.
path = subprocess.run(
	["kpsewhich", "unicode-math-table.tex"],
	stdout=subprocess.PIPE,
	check=True,
).stdout
lines = Path(path.decode('u8').strip('\n')).read_text().splitlines()
# drop blank lines and TeX comment lines
lines = [line for line in lines if line and not line.startswith("%")]

# character -> list of control sequence names, in table order
unicode_math_table_=defaultdict(list)
for line in lines:
	# Each remaining line must look like
	#   \UnicodeMathSymbol{"<hex>}{\<csname> }{\math<class>}{<description>}%
	match = re.fullmatch(r'\\UnicodeMathSymbol{"(.*)}{\\(.*?) *}{\\math(.*)}{(.*)}%', line)
	assert match
	unicode_char=chr(int(match[1], 16))
	csname=match[2]
	unicode_math_table_[unicode_char].append(csname)
# frozen view: character -> tuple of control sequence names
unicode_math_table={unicode_char: tuple(csnames) for unicode_char, csnames in unicode_math_table_.items()}

# ======== extract unicode-math synonyms

def control_sequences()->list[str]:
	"""
	Return the letters-only control sequence names reported by ``tex.hashtokens()``.

	The Lua snippet keeps the names matching ``^[A-Za-z]+$``, removes duplicates by
	using them as table keys, and joins them with NUL bytes; the joined string is
	split again on the Python side. A falsy result from ``lua_try_eval`` is treated
	as the empty string, in which case the return value is ``[""]``.
	"""
	lua_source = r"""
	do
		local s={}
		for k, v in pairs(tex.hashtokens()) do
			if v:find("^[A-Za-z]+$") then
				s[v]=0
			end
		end
		local t={}
		for v, _ in pairs(s) do table.insert(t, v) end
		return table.concat(t, "\x00")
	end
	"""
	joined_names = lua_try_eval(lua_source) or ""
	return joined_names.split("\x00")

# Hand-written synonym groups; each entry lists control sequence names (without
# the backslash) that are treated as the same symbol. More groups are appended
# to this list further down.
extra_synonyms_list: list[list[str]] = [
	group.split()
	for group in (
		"adots iddots",
		"unicodecdots cdots",  # https://github.com/wspr/unicode-math/issues/571
		"unicodeellipsis ldots",
		# currently disabled:
		#"llbracket lBrack",
		#"rrbracket rBrack",
	)
]


# Names and meanings of all letters-only control sequences in the current engine.
# The names `c` and `m` are kept as module globals on purpose.
c = control_sequences()
m = {x: T[x].meaning_str() for x in c}

# A protected macro whose whole body is a single control word (optionally
# followed by one space) is treated as an alias of that control word.
pattern = re.compile(r'\\protected macro:->\\([A-Za-z]+) ?')

for alias, alias_meaning in m.items():
	match = pattern.fullmatch(alias_meaning)
	if match:
		extra_synonyms_list.append([alias, match[1]])

def same_meaning_control_sequences(meaning: dict[str, str])->list[list[str]]:
	"""
	Group control sequence names that share the same meaning string.

	:param meaning: mapping from control sequence name to its meaning string.
	:return: one list per meaning shared by at least two names. Names whose
		meaning is ``"undefined"`` are never grouped. Inside a group the names
		keep the iteration order of ``meaning``; groups are ordered by meaning.

	Only the keys of ``meaning`` are considered, so the function does not depend
	on any module-level list of names.
	"""
	lookup = meaning.__getitem__
	# groupby only merges adjacent items, hence the sort by the same key;
	# the sort is stable, so dict order is preserved within each group
	names_by_meaning = sorted(meaning, key=lookup)
	return [
		group
		for text, members in groupby(names_by_meaning, key=lookup)
		if text!="undefined"
		for group in [list(members)]
		if len(group)>=2
		]

# Add groups of names that have identical meanings in the unicode-math engine.
extra_synonyms_list += same_meaning_control_sequences(m)

# ======== extract amsmath&stix synonyms


# One name->meaning dict per preamble below, in the same order as the preambles.
m_values=[]
for preamble in [
r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\begin{document}
""",
r"""
\documentclass{article}
\usepackage{stix}
\begin{document}
"""
]:
	# A separate luatex process is started for each preamble and made the
	# default engine only for the duration of the with-block.
	with ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"}) as e, default_engine.set_engine(e):
		execute(preamble)
		# NOTE: this rebinds the module globals `c` and `m`.
		c=control_sequences()
		m={x: T[x].meaning_str() for x in c}
		extra_synonyms_list += same_meaning_control_sequences(m)
		m_values.append(m)
# exactly two preambles above, so exactly two dicts here
[amsmath_meaning, stix_meaning]=m_values
# ======== build extra_synonyms table

# Merge groups that overlap: repeatedly take the name that occurs most often
# and fuse every group containing it into one, until each name occurs once.
while True:
	occurrences = Counter(name for group in extra_synonyms_list for name in group)
	[(item, frequency)] = occurrences.most_common(1)
	if frequency==1:
		break
	assert frequency>1
	touching = [group for group in extra_synonyms_list if item in group]
	untouched = [group for group in extra_synonyms_list if item not in group]
	# the fused group comes first, followed by all groups that do not contain item
	fused = {name for group in touching for name in group}
	extra_synonyms_list = [[*fused], *untouched]
	

# some simple filtering -- we will just use \cdots and \ldots
_ignored_names = ("dotsc", "dotsm", "dotsb", "dots")
# frozensets make identical groups collapse into one (deduplicate)
_unique_groups = {
	frozenset(name for name in group if name not in _ignored_names)
	for group in extra_synonyms_list
}
# keep only real groups (two or more names); sort names and groups
extra_synonyms_list = sorted(sorted(group) for group in _unique_groups if len(group)>1)

# sanity check: no name may belong to more than one group
tmp = Counter(name for group in extra_synonyms_list for name in group)
assert tmp.most_common()[0][1]==1, tmp

# name -> the (shared) list holding its whole synonym group, itself included
extra_synonyms = {}
for group in extra_synonyms_list:
	for name in group:
		extra_synonyms[name] = group

# ======== check how much of the table is valid on unicode-math/luatex

def getdelcode(x: str)->tuple[int, int, int, int]:
	"""
	Return the values of ``tex.getdelcode`` for the character ``x`` as a tuple of ints.

	The Lua side prints every value followed by a comma; the trailing comma is
	removed before the text is split and converted.
	"""
	lua_code = rf"""
		for _, v in ipairs(tex.getdelcode({ord(x)})) do tex.sprint(v..",") end
		"""
	printed = TokenList([r"\directlua", TokenList.fstr(lua_code)]).expand_x().str()
	return tuple(int(field) for field in printed.rstrip(",").split(","))  # type: ignore

@functools.cache
def meaning(csname: str)->str:
	"""Return ``T[csname].meaning_str()``; results are memoised per ``csname``."""
	meaning_text = T[csname].meaning_str()
	return meaning_text

@functools.lru_cache(maxsize=None)
def good_delimiter(meaning: str, ch: str)->bool:
	"""
	Tell whether ``meaning`` is the expected ``\\Udelimiter`` definition for ``ch``.

	All of the following must hold:

	* the mathcode of ``ch`` has family 0 and position equal to its code point;
	* ``getdelcode(ch)`` is ``(0, code point, 0, 0)``;
	* ``meaning`` equals the protected macro expanding to
	  ``\\Udelimiter <class>\\symoperators "<code point as 5 hex digits>\\scan_stop: ``.
	"""
	codepoint = ord(ch)
	entry = umathcode[ch]
	if not (entry.family==0 and entry.position==codepoint):
		return False
	a, b, c, d = getdelcode(ch)
	if (a, b, c, d)!=(0, codepoint, 0, 0):
		return False
	expected = f'\\protected macro:->\\Udelimiter {entry.cls.value}\\symoperators "{codepoint:05X}\\scan_stop: '
	return meaning==expected

# Characters that the .sty file registers itself through \__umi_special_handle{<char>};
# the file is read relative to the current working directory.
specially_handled = set(
	re.findall(r'\\__umi_special_handle{(.)}', Path("unicode-math-input.sty").read_text())
)

# Characters that are skipped entirely.
not_handled = set("⎴⎵⏜⏝⏞⏟⟌\u03a2\U0001d455")

# unicode-math csname prefix -> name of the alphabet-selecting macro that wraps
# the base symbol (None: no wrapper).
_alphabet_prefixes = [
	("mup",       None),
	("mbf",       "umiMathbf"),
	("mit",       "umiMathit"),
	("mbfit",     "umiMathbfit"),    # https://tex.stackexchange.com/questions/14395/bold-italic-vectors
	("mscr",      "umiMathscr"),
	("mbfscr",    "umiMathbfscr"),   # https://tex.stackexchange.com/questions/23455/latex-calligraphic-script-bold
	("mfrak",     "umiMathfrak"),
	("Bbb",       "umiMathbb"),
	("mitBbb",    "umiMathbbit"),    # https://tex.stackexchange.com/questions/16645/blackboard-italic-font
	("mbffrak",   "umiMathbffrak"),  # https://tex.stackexchange.com/questions/610696/may-i-have-bold-mathfraktur
	("msans",     "umiMathsf"),
	("mbfsans",   "umiMathsfbf"),    # https://tex.stackexchange.com/questions/340097/bold-sans-serif-math-font
	("mitsans",   "umiMathsfit"),
	("mbfitsans", "umiMathsfbfit"),
	("mtt",       "umiMathtt"),
]
# match against longest prefix first (stable sort: ties keep the order above)
math_alphabet_translate = dict(sorted(_alphabet_prefixes, key=lambda pair: -len(pair[0])))
# Translation of the part of a unicode-math csname that remains after the
# alphabet prefix has been removed:
#   * Greek letters and a few symbols map to the control sequence of the same name,
#   * Latin letters map to themselves,
#   * spelled-out digits map to the digit character.
_named_symbols = (
	"alpha Alpha beta Beta chi Chi delta Delta digamma Digamma "
	"epsilon Epsilon eta Eta gamma Gamma iota Iota kappa Kappa "
	"lambda Lambda mu Mu nabla nu Nu omega Omega omicron Omicron "
	"partial phi Phi pi Pi psi Psi rho Rho sigma Sigma sum "
	"tau Tau theta Theta upsilon Upsilon "
	"varepsilon varkappa varphi varpi varrho varsigma vartheta varTheta "
	"xi Xi zeta Zeta"
).split()
_digit_names = "zero one two three four five six seven eight nine".split()

math_alphabet_csname_translation = {
	# "alpha" -> r"\alpha", ...
	**{name: "\\" + name for name in _named_symbols},
	# "a" -> "a", "A" -> "A", "b" -> "b", ...
	**{
		letter: letter
		for code in range(ord("a"), ord("z")+1)
		for letter in (chr(code), chr(code).upper())
	},
	# "zero" -> "0", ..., "nine" -> "9"
	**{name: str(digit) for digit, name in enumerate(_digit_names)},
	}

# Greek control sequences paired with a Latin letter; when one of these is
# emitted, the Latin letter is written next to it as a second choice.
math_alphabet_redundant_greek = {
	"\\" + greek_name: latin_letter
	for greek_name, latin_letter in (
		("Alpha",   "A"),
		("Beta",    "B"),
		("Chi",     "X"),
		("Digamma", "F"),
		("Epsilon", "E"),
		("Eta",     "H"),
		("Iota",    "I"),
		("Kappa",   "K"),
		("Mu",      "M"),
		("Nu",      "N"),
		("omicron", "o"),
		("Omicron", "O"),
		("Rho",     "P"),
		("Tau",     "T"),
		("Zeta",    "Z"),
	)
	}


# csname -> ASCII character offered as an additional alternative.
# Each value is a single character: the emission loop adds it with `+=` on a
# list, which appends the characters of the string one by one.
ASCII_symbol_synonym = dict(minus="-", mid="|")

##

# Non-ASCII characters with a changed mathcode that are neither in the
# unicode-math table nor specially handled nor explicitly skipped.
remaining_chars = {
	ch
	for ch in changed_chars
	if ord(ch) >= 0x80
	and ch not in unicode_math_table
	and ch not in specially_handled
	and ch not in not_handled
}

# Every printable ASCII character "!".."~" has a FULLWIDTH counterpart at a fixed
# offset; map each of those to \char<ASCII code>.
for i in range(ord("!"), ord("~")+1):
	ascii_ch = chr(i)
	fullch = chr(i + 0xff00 - 0x20)
	assert unicodedata.name(fullch) == "FULLWIDTH " + unicodedata.name(ascii_ch)
	remaining_chars.discard(fullch)
	print(rf'\__umi_define_char{{{fullch}}}{{\char{i} }}')

# Every csname known from the unicode-math table or defined with amsmath/stix loaded.
defined_csnames = set(stix_meaning).union(amsmath_meaning, *unicode_math_table.values())


# A second engine (pdftex) with the AMS packages and mathrsfs loaded. It is only
# made the default engine temporarily, inside `with` blocks like the one below.
pdf_engine=ChildProcessEngine("pdftex")
with default_engine.set_engine(pdf_engine): execute(r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{mathrsfs}
\begin{document}
""")

def remove_not(a: str)->Optional[str]:
	"""
	Return the un-negated control sequence for a negated one, or None.

	``a`` is an item such as ``\\nleq`` or ``\\notin``. If dropping a leading
	``\\not`` (tried first) or ``\\n`` leaves a name contained in the module-level
	``defined_csnames``, that name is returned with a backslash in front.
	``\\ni`` and ``\\nu`` are never treated as negations.
	"""
	if a in {r"\ni", r"\nu"}:
		return None
	for negation_prefix in (r"\not", r"\n"):
		if not a.startswith(negation_prefix):
			continue
		base_name = a[len(negation_prefix):]
		if base_name in defined_csnames:
			return "\\" + base_name
	return None

# Main emission loop: for every character of the unicode-math table that is
# supported, print one \__umi_define_char... line to stdout.
for unicode_char, csnames_ in unicode_math_table.items():
	csnames = [*csnames_]
	if unicodedata.combining(unicode_char) != 0:
		# disabled debugging output
		if 0:
			print(
					repr(unicode_char),
					f"U+{ord(unicode_char):04X}",
					unicodedata.name(unicode_char),
					"mathcode: ", umathcode[unicode_char],
					{csname: meaning(csname) for csname in csnames},
					f" -- good: {good}" if good else ""
					)
		for csname in csnames:
			assert "Umathaccent" in meaning(csname), (unicode_char, unicodedata.name(unicode_char), csname, meaning(csname))
		continue # don't support combining characters

	# Characters with combining class 0 whose Unicode name still contains the
	# word COMBINING; only the four enclosing marks below are expected here.
	is_combining2="COMBINING" in unicodedata.name(unicode_char).split()
	if is_combining2:
		for csname in csnames:
			assert csname in "enclosecircle enclosesquare enclosediamond enclosetriangle".split(), (unicode_char, csname)
		assert len(csnames)==1

	# for those characters a space is written in front of the character in the output
	optional_space=" " if is_combining2 else ""

	if ord(unicode_char) <= 0x7f: continue
	if unicode_char in specially_handled or unicode_char in not_handled: continue
	#
	# csnames that are not simply "the character <char>" are either delimiters
	# defined through \Udelimiter, or something that is not understood ("bad")
	bad_or_delimiter = [csname for csname in csnames if meaning(csname) != "the character " + unicode_char]
	delimiter = [csname for csname in bad_or_delimiter if good_delimiter(meaning(csname), unicode_char)]
	bad = [*{*bad_or_delimiter} - {*delimiter}]
	is_delimiter = delimiter or getdelcode(unicode_char)!=(-1, 0, 0, 0)
	# in unicode-math:
	# the situation with ⟨/langle and ↑/uparrow is different
	# in both cases the character gets assigned mathcode and delcode so \left⟨ and \left↑ both work
	# in langle case the macro is defined to be \protected macro:->\Udelimiter 4\symoperators "027E8\scan_stop:
	#   this is because of @@_set_math_open logic which sets the macro like that
	#   (I don't know why it doesn't just do the thing below)
	# in uparrow case the macro is defined to be "the character ↑"
	#   then delcode is assigned in @@_assign_delcode manually
	if bad:
		good = [*set(csnames) - set(bad)]
		print(
				repr(unicode_char),
				f"U+{ord(unicode_char):04X}",
				unicodedata.name(unicode_char),
				"mathcode: ", umathcode[unicode_char],
				"bad: ", {csname: meaning(csname) for csname in bad},
				f" -- good: {good}" if good else ""
		)
		assert False, "please specially handle this"
	else:
		# extend the csnames with their synonyms, then drop duplicates while
		# keeping first-occurrence order (dict keys preserve insertion order)
		csnames = [*csnames_]
		for csname in [*csnames]:
			if csname in extra_synonyms:
				csnames+=extra_synonyms[csname]
		csnames=[*{csname: None for csname in csnames}]

		# items1: the TeX snippets that are candidates for this character
		items1=[]
		for csname in csnames:
			if not is_delimiter:
				with default_engine.set_engine(pdf_engine):
					assert "delimiter" not in T[csname].meaning_str(), (unicode_char, csname)
				# that is the symbol is not a delimiter in pdf_engine either (check is not particularly reliable but okay)

			# csnames starting with a math alphabet prefix (longest prefix is
			# tried first) become <selector>{<base symbol>}
			for prefix, replacement in math_alphabet_translate.items():
				if csname.startswith(prefix):
					assert csname not in ASCII_symbol_synonym
					cs = math_alphabet_csname_translation[csname.removeprefix(prefix)]
					def wrap_in_alphabet_selector(cs: str)->str:
						# uses `replacement` of the current iteration; only called right below
						if replacement is None: return cs
						return "\\" + replacement + "{" + cs + "}"

					if cs in math_alphabet_redundant_greek:
						items1.append(wrap_in_alphabet_selector(
							"\\__umi_alternatives_iisafe" + cs + ("" if math_alphabet_redundant_greek[cs].startswith("\\") else " ") + math_alphabet_redundant_greek[cs]
							))
					else:
						items1.append(wrap_in_alphabet_selector(cs))
					break
			else:
				# no alphabet prefix matched: use the control sequence itself,
				# plus its single-character ASCII synonym if there is one
				items1.append("\\" + csname)
				if csname in ASCII_symbol_synonym: items1+=ASCII_symbol_synonym[csname]

		assert items1
		if is_delimiter and len(items1)>1:
			print("Warning: Synonym for delimiter not supported?", unicode_char, delimiter, items1, file=sys.stderr)
			del items1[1:]

		# The items are written after the \__umi_alternatives... macro name
		# without braces around them.
		if len(items1)==1:
			a = items1[0]
			b = remove_not(a)
			if b is not None:
				assert not is_delimiter
				# "\\_" instead of the invalid escape sequence "\_": same output,
				# but no invalid-escape warning from the Python compiler
				print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_not{a}{b}}}")
			else:
				if is_delimiter:
					print(f"\\__umi_define_char_maybe_delimiter{{{optional_space}{unicode_char}}}{{{a}}}")
				else:
					print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{{a}}}")
		elif len(items1)==2:
			assert re.fullmatch(r'\\[a-zA-Z]+', items1[0]), items1
			assert re.fullmatch(r'\\[a-zA-Z]+|[^a-zA-Z]', items1[1]), items1
			b=remove_not(items1[0])
			if b is not None:
				# if the first alternative is a negation, the second one must be too
				d=remove_not(items1[1])
				assert d is not None, items1
				print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_not_two{items1[0]}{items1[1]}{b}{d}}}")
			else:
				print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives{items1[0]}{items1[1]}}}")
		else:
			# three or more alternatives: all must be plain control words and
			# none may be a negation
			assert len(items1)>=3, items1
			assert all(remove_not(x) is None for x in items1), items1
			assert all(re.fullmatch(r'\\[a-zA-Z]+', c) for c in items1), items1
			print(f"\\__umi_define_char{{{optional_space}{unicode_char}}}{{\\__umi_alternatives_m{{{''.join(items1)}}}}}")

##

# ========

# The generator ends here; nothing below this call is executed when the
# script is run.
sys.exit()

# ========  part below are draft.

# Draft/exploration code. Three alternative document setups follow one after
# another.
default_engine.set_engine(ChildProcessEngine("luatex", env={**os.environ, "hash_extra": "0"}, autorestart=True))
execute(r'\documentclass{article}\usepackage{unicode-math}\begin{document}')

execute(r'\documentclass{article}\usepackage{amsmath,amssymb,amsfonts}\begin{document}')

execute(r'\documentclass{article}\usepackage{amsmath}\usepackage{amssymb}\usepackage{amsfonts}\usepackage{tikz}')


@functools.cache
def is_defined(csname: str)->bool:
	"""Return whether the meaning of ``csname`` is anything other than ``"undefined"`` (memoised)."""
	meaning_text = T[csname].meaning_str()
	return meaning_text != "undefined"


# show distinct items with math alphabet
# a_: base name (csname with its longest matching alphabet prefix removed)
#     -> list of the full csnames that reduce to it
a_=defaultdict(list)
for l in unicode_math_table.values():
	for csname in l:
		if csname.startswith(tuple(math_alphabet_translate)):
			t = csname
			for prefix in sorted(math_alphabet_translate, key=len, reverse=True):
				if t.startswith(prefix):
					t=t.removeprefix(prefix)
					break
			a_[t].append(csname)
# bare expressions: these have no effect when run as a script
a_

a_.keys()

# print the translated control sequences that are undefined in the current engine
for v in math_alphabet_csname_translation.values():
	if v.startswith("\\") and not is_defined(v[1:]):
		print(v)


def is_okay(csname: str)->bool:
	"""Return True if ``csname`` is defined or starts with one of the math alphabet prefixes."""
	alphabet_prefixes = tuple(math_alphabet_translate)
	return True if is_defined(csname) else csname.startswith(alphabet_prefixes)
#
# print bad ones
# i.e. non-ASCII characters for which none of the csnames passes is_okay
for unicode_char, csnames_ in unicode_math_table.items():
	if ord(unicode_char) >= 0x80 and all( not is_okay(csname) for csname in csnames_ ):
		print(unicode_char, csnames_)


# print okay ones
# i.e. non-ASCII characters with at least one csname that is defined
for unicode_char, csnames_ in unicode_math_table.items():
	valid_csnames = [ csname for csname in csnames_ if T[csname].meaning_str()!="undefined" ]
	if ord(unicode_char) >= 0x80 and valid_csnames:
		print(unicode_char, valid_csnames)

# The remaining lines are bare expressions and one-off experiments; their
# results are not stored or printed.
T.lsime.meaning_str()

"ℝ".encode('u8')

BalancedTokenList([T.meaning, Catcode.active("\xe2")]).expand_x(engine=pdf_engine)

T["UTFviii@three@octets"].meaning_str(engine=pdf_engine)

T["UTFviii@three@octets@combine"].meaning_str(engine=pdf_engine)

T["UTF@three@octets@noexpand"].meaning_str(engine=pdf_engine)

BalancedTokenList([T.meaning, Catcode.active("\xe2")]).expand_x()

# experiment in a separate pdftex engine
test_engine=ChildProcessEngine("pdftex")
BalancedTokenList(r"\def\aa{bb}").execute(engine=test_engine)
BalancedTokenList(r"\csname\noexpand\aa\endcsname").expand_o(engine=test_engine)  # give error
BalancedTokenList(r"\csname\string\aa\endcsname").expand_o(engine=test_engine)  # \[\aa] as expected