--
-- Copyright (c) 2021-2025 Zeping Lee
-- Released under the MIT license.
-- Repository: https://github.com/zepinglee/citeproc-lua
--

local latex_parser = {}

local latex_data
local markup
local util
local using_luatex, kpse = pcall(require, "kpse")
if using_luatex then
  latex_data = require("citeproc-latex-data")
  markup = require("citeproc-output")
  util = require("citeproc-util")
else
  latex_data = require("citeproc.latex-data")
  markup = require("citeproc.output")
  util = require("citeproc.util")
end

local lpeg = require("lpeg")


-- Convert LaTeX string to Unicode string
function latex_parser.latex_to_unicode(str)
  local ast = latex_parser.latex_grammar:match(str)
  latex_parser.convert_ast_to_unicode(ast)
  local res = latex_parser.to_string(ast)
  return res
end


-- Convert LaTeX string to InlineElements
function latex_parser.latex_to_inlines(str, strict, case_protection)
  local ast = latex_parser.latex_grammar:match(str)
  latex_parser.convert_ast_to_unicode(ast)
  local inlines = latex_parser.convert_tokens_to_inlines(ast, strict, case_protection)
  return inlines
end


-- Convert LaTeX string to string with HTML-like tags
function latex_parser.latex_to_pseudo_html(str, strict, case_protection)
  local ast = latex_parser.latex_grammar:match(str)
  latex_parser.convert_ast_to_unicode(ast)
  local inlines = latex_parser.convert_tokens_to_inlines(ast, strict, case_protection)
  local pseudo_html_format = markup.PseudoHtml:new()
  local res = pseudo_html_format:write_inlines(inlines, {})
  return res
end


-- strict: preserve unkown LaTeX commands
-- case_protection: BibTEX case protection with curly braces


---@param str string
---@param keep_unknown_commands boolean?
---@param case_protection boolean?
---@param check_sentence_case boolean?
---@return string
function latex_parser.latex_to_sentence_case_pseudo_html(str, keep_unknown_commands, case_protection,
    check_sentence_case)
  local ast = latex_parser.latex_grammar:match(str)
  latex_parser.convert_ast_to_unicode(ast)
  local inlines = latex_parser.convert_tokens_to_inlines(ast, keep_unknown_commands, case_protection)
  local pseudo_html_format = markup.PseudoHtml:new()
  pseudo_html_format:convert_sentence_case(inlines, check_sentence_case)
  local res = pseudo_html_format:write_inlines(inlines, {})
  return res
end


-- Convert LaTeX string to CSL rich-text
function latex_parser.latex_to_rich_text(str)
  local ast = latex_parser.latex_grammar:match(str)
  latex_parser.convert_ast_to_unicode(ast)
  local res = latex_parser.convert_ast_to_rich_text(ast)
  return res
end


function latex_parser.get_latex_grammar()
  local P = lpeg.P
  local R = lpeg.R
  local S = lpeg.S
  local C = lpeg.C
  local Ct = lpeg.Ct
  local V = lpeg.V

  ---@diagnostic disable: codestyle-check
  local space = S(" \t\r\n")^0
  local specials = P"~" / util.unicode["no-break space"]
                  --  + P"\\$" / "$"
                  --  + P"\\%" / "%"
                  --  + P"\\&" / "&"
                  --  + P"\\#" / "#"
                  --  + P"\\_" / "_"
                  --  + P"\\{" / "{"
                  --  + P"\\}" / "}"
  local control_sequence = C(P"\\" * (R("AZ", "az")^1 * space + (1 - R("AZ", "az")))) / function (cs)
    return {
      type = "control_sequence",
      name = util.rstrip(cs),
      raw = cs,
    }
  end
  local math = P"$" * C((P"\\$" + 1 - S"$")^0) * P"$" / function (math_text)
    return {
      type = "math",
      text = math_text,
    }
  end
  local ligatures = P"``" / util.unicode["left double quotation mark"]
                    + P"`" / util.unicode["left single quotation mark"]
                    + P"''" / util.unicode["right double quotation mark"]
                    + P"'" / util.unicode["right single quotation mark"]
                    + P"---" / util.unicode["em dash"]
                    + P"--" / util.unicode["en dash"]
  local utf8_char = R("\0\127") + R("\194\244") * R("\128\191")^1
  local plain_text = C(utf8_char - S"{}$\\")
  local latex_grammar = P{
    "latex_text";
    latex_text = Ct((specials + control_sequence + math + ligatures + specials + V"group" + plain_text)^0),
    group = C(P"{" * V"latex_text" * P"}") / function (raw, group_contents)
      return {
        type = "group",
        contents = group_contents,
        raw = raw,
      }
    end,
  }
  ---@diagnostic enable: codestyle-check
  return latex_grammar
end


latex_parser.latex_grammar = latex_parser.get_latex_grammar()


local format_commands_with_argment = {
  ["\\textbf"] = "bold",
  ["\\textit"] = "italic",
  ["\\textsl"] = "italic",
  ["\\emph"] = "italic",
  ["\\enquote"] = "quote",
  ["\\textsc"] = "sc",
  ["\\sout"] = "strike",  -- from ulem package
  ["\\st"] = "strike",  -- from soul package
  ["\\textsuperscript"] = "sup",
  ["\\textsubscript"] = "sub",
}

local format_commands_without_argment = {
  ["\\bf"] = "bold",
  ["\\bfseries"] = "bold",
  ["\\it"] = "italic",
  ["\\itshape"] = "italic",
  ["\\sl"] = "italic",
  ["\\slshape"] = "italic",
  ["\\em"] = "italic",
  ["\\scshape"] = "sc",
  ["\\sc"] = "sc",
}


function latex_parser.to_string(ast)
  local res = ""
  for _, token in ipairs(ast) do
    if type(token) == "string" then
      res = res .. token
    elseif type(token) == "table" then
      if token.type == "control_sequence" then
        res = res .. token.raw
      elseif token.type == "math" then
        res = res .. token.text
      elseif token.type == "group" then
        res = res .. "{" .. latex_parser.to_string(token.contents) .. "}"
      end
    end
  end
  return res
end


local function _remove_braces_of_diacritic(tokens, i, orig_first_token)
  -- Remove braces in "{\'A}"
  local token = tokens[i]
  if #token.contents == 1 then
    local first_token = token.contents[1]
    local prev_token = tokens[i - 1]
    if orig_first_token and type(orig_first_token) == "table"
        and orig_first_token.type == "control_sequence"
        and not (type(prev_token) == "table" and prev_token.type == "control_sequence")
        and type(first_token) == "string" then
      tokens[i] = first_token
    end
  end
end

---@param tokens any
---@param i any
---@param command_info string | table
local function _replace_diacritic_with_unicode(tokens, i, command_info)
  local unicode_char
  if type(command_info) == "string" then
    unicode_char = utf8.char(tonumber(command_info, 16))
    tokens[i] = unicode_char

  elseif type(command_info) == "table" then
    if not command_info.composites then
      error('Missing "composites"')
    end
    -- The command takes an argument (\"{o})
    local arg
    local j = i + 1
    while j <= #tokens and tokens[j] == " " do
      j = j + 1
    end
    if j <= #tokens then
      local next_token = tokens[j]
      if type(next_token) == "string" then
        arg = next_token
      elseif type(next_token) == "table" then
        if next_token.type == "control_sequence" then
          arg = next_token.name
        elseif next_token.type == "group" then
          if #next_token.contents == 0 then
            arg = ""
          elseif #next_token.contents == 1 then
            next_token = next_token.contents[1]
            if type(next_token) == "string" then
              arg = next_token
            elseif type(next_token) == "table" then
              arg = next_token.name
            end
          end
        end
      end
    end
    if arg and command_info.composites[arg] then
      unicode_char = utf8.char(tonumber(command_info.composites[arg], 16))
      tokens[i] = unicode_char
      for k = j, i + 1, -1 do
        table.remove(tokens, k)
      end
    elseif command_info.code_point then
      -- Move the Diacritic after the next token
      unicode_char = utf8.char(tonumber(command_info.code_point, 16))
      for k = i, j - 1 do
        tokens[k] = tokens[k + 1]
      end
      tokens[j] = unicode_char
    end
  end
end

function latex_parser.convert_ast_to_unicode(tokens)
  local res = ""
  local i = 1
  while i <= #tokens do
    local token = tokens[i]
    if type(token) == "table" then
      if token.type == "group" then
        local orig_first_token = token.contents[1]
        latex_parser.convert_ast_to_unicode(token.contents)
        _remove_braces_of_diacritic(tokens, i, orig_first_token)

      elseif token.type == "control_sequence" then
        local cs = token
        local command_info = latex_data.unicode_commands[cs.name]
        if command_info then
          _replace_diacritic_with_unicode(tokens, i, command_info)
        end
      end
    end
    i = i + 1
  end
  return res
end


-- Also return the index to last token read by the cs
function latex_parser.convert_cs_to_inlines(tokens, i, strict, case_protection)
  local token = tokens[i]
  local inlines = {}

  local command_info = latex_data.commands[token.name]
  if command_info then
    local inline_type = command_info.inline_type
    if inline_type then
      local arg_inlines
      if command_info.num_args == 1 then
        local next_token = tokens[i + 1]
        if type(next_token) == "table" and next_token.type == "group" then
          arg_inlines = latex_parser.convert_tokens_to_inlines(next_token.contents, strict, false)
          if case_protection then
            arg_inlines = {markup.NoCase:new(arg_inlines)}
          end
        else
          arg_inlines = latex_parser.convert_tokens_to_inlines({next_token}, strict, case_protection)
        end
        i = i + 1
      else  -- command_info.num_args == 0
        if command_info.inline_type then
          local arg_tokens = util.slice(tokens, i + 1, #tokens)
          i = #tokens
          arg_inlines = latex_parser.convert_tokens_to_inlines(arg_tokens, strict, case_protection)
        end
      end

      if inline_type == "Formatted" then
        local inline = markup.Formatted:new(arg_inlines,
          {[command_info.formatting_key] = command_info.formatting_value})
        table.insert(inlines, inline)
      elseif inline_type == "Quoted" then
        table.insert(inlines, markup.Quoted:new(arg_inlines))
      elseif inline_type == "NoCase" then
        if not case_protection then
          arg_inlines = {markup.NoCase:new(arg_inlines)}
        end
        util.extend(inlines, arg_inlines)
      else
        -- NoCase
        table.insert(inlines, markup[inline_type]:new(arg_inlines))
      end

    elseif command_info.action then
      if command_info.action == "gobble" then
        table.remove(tokens, i)
        for j = 1, command_info.num_args do
          table.remove(tokens, i)
        end
      end
    end

  else
    -- Unrecognized command
    if strict then
      local raw = token.raw
      for j = i + 1, #tokens do
        if type(tokens[j]) == "table" and tokens[j].type == "group" then
          i = j
          raw = raw .. latex_parser.convert_token_to_raw(tokens[j])
        else
          break
        end
      end
      table.insert(inlines, markup.Code:new(raw))
    else
      -- Gobble following groups as pandoc does:
      -- "Foo \unrecognized{bar}{baz} quz" -> "Foo  quz"
      for j = i + 1, #tokens do
        if type(tokens[j]) == "table" and tokens[j].type == "group" then
          i = j
        else
          break
        end
      end
    end
  end

  return inlines, i
end

---@param token any
---@return string
function latex_parser.convert_token_to_raw(token)
  if type(token) == "string" then
    return token
  elseif type(token) == "table" then
    if token.raw then
      return token.raw
    elseif token.type == "group" then
      local raw = "{"
      for _, token_ in ipairs(token.contents) do
        raw = raw .. latex_parser.convert_token_to_raw(token_)
      end
      raw = raw .. "}"
      return raw
    else
      error(token.type)
      return ""
    end
  else
    error(type(token))
    return ""
  end
end

-- TODO: raise a warning for unrecognized LaTeX command like `\switchargs`
function latex_parser.convert_group_to_inlines(token, strict, case_protection, force_add_braces)
  if case_protection then
    local first_token = token.contents[1]
    if type(first_token) == "table" and first_token.type == "control_sequence" then
      -- BibTeX's "special character" like "{\'E}nd""
      case_protection = false
    end
  end
  local inlines = latex_parser.convert_tokens_to_inlines(token.contents, strict, false)
  if #inlines == 0 then
    -- Empty group like `{\\noopsort{1973b}}`
    return inlines
  end
  if case_protection then
    inlines = {markup.NoCase:new(inlines)}
  end
  if strict then
    local has_command = false
    for _, inline in ipairs(inlines) do
      if inline._type == "Code" and util.startswith(inline.value, "\\") then
        has_command = true
        break
      end
    end
    if force_add_braces or has_command then
      table.insert(inlines, 1, markup.Code:new("{"))
      table.insert(inlines, markup.Code:new("}"))
    end
  end
  return inlines
end


---A group is surrounded with braces in these cases:
---1. Following a command; e.g, `\textbf{foo}` and `\textcolor{red}{flag}`
---2. Containing a command; e.g, {\bfseies foo}
---@param tokens table
---@param strict boolean? Convert unrecognized LaTeX command to Code element
---@param case_protection boolean? Protect case with BibTeX's rules
---@return table
function latex_parser.convert_tokens_to_inlines(tokens, strict, case_protection)
  local inlines = {}

  local i = 1
  local add_group_braces = false
  while i <= #tokens do
    local token = tokens[i]
    if type(token) == "string" then
      -- The merge following string tokens
      for j = i + 1, #tokens do
        if type(tokens[j]) == "string" then
          token = token .. tokens[j]
          i = j
        else
          break
        end
      end
      table.insert(inlines, markup.PlainText:new(token))
      add_group_braces = false

    elseif type(token) == "table" then
      if token.type == "control_sequence" then
        local cs_inlines
        cs_inlines, i = latex_parser.convert_cs_to_inlines(tokens, i, strict, case_protection)
        util.extend(inlines, cs_inlines)
        add_group_braces = true

      elseif token.type == "group" then
        util.extend(inlines, latex_parser.convert_group_to_inlines(token, strict, case_protection, add_group_braces))

      elseif token.type == "math" then
        table.insert(inlines, markup.MathTeX:new(token.text))
      end

    end
    i = i + 1
  end

  -- Merge adjacent Code inlines.
  for i = #inlines, 1, -1 do
    if i > 1 then
      local inline = inlines[i]
      local prev = inlines[i - 1]
      if type(inline) == "table" and inline._type == "Code" and
          type(prev) == "table" and prev._type == "Code" then
        prev.value = prev.value .. inline.value
        table.remove(inlines, i)
      end
    end
  end

  return inlines
end


function latex_parser.convert_ast_to_rich_text(tokens)
  local res = {}

  local tmp_str = ""
  local i = 1
  while i <= #tokens do
    local token = tokens[i]
    if type(token) == "string" then
      tmp_str = tmp_str .. token
    elseif type(token) == "table" then
      if tmp_str ~= "" then
        table.insert(res, tmp_str)
        tmp_str = ""
      end
      if token.type == "control_sequence" then
        local format_with_argment = format_commands_with_argment[token.name]
        local format_without_argment = format_commands_without_argment[token.name]
        if format_with_argment then
          if i < #tokens then
            local next_token = tokens[i + 1]
            local rich_text
            if type(next_token) == "string" then
              rich_text = {[format_with_argment] = next_token}
            elseif type(next_token) == "table" then
              if next_token.type == "control_sequence" then
                local content = {code = next_token.raw}
                rich_text = {[format_with_argment] = {content}}
              elseif next_token.type == "group" then
                rich_text = {[format_with_argment] = latex_parser.convert_ast_to_rich_text(next_token.contents)}
              elseif next_token.type == "math" then
                local content = latex_parser.convert_ast_to_rich_text(next_token.contents)
                rich_text = {[format_with_argment] = {{["math-tex"] = content}}}
              end
            end
            if rich_text then
              table.insert(res, rich_text)
              i = i + 1
            end
          else
            table.insert(res, {code = token.raw})
          end

        elseif format_without_argment then
          local rest_tokens = {}
          for j = i + 1, #tokens do
            table.insert(rest_tokens, tokens[j])
          end
          local rich_text = {[format_without_argment] = latex_parser.convert_ast_to_rich_text(rest_tokens)}
          table.insert(res, rich_text)
          i = #tokens

        else
          local rich_text = {code = token.raw}
          table.insert(res, rich_text)

        end

      elseif token.type == "group" then
        table.insert(res, {code = "{"})
        for _, rich_text in ipairs(latex_parser.convert_ast_to_rich_text(token.contents)) do
          table.insert(res, rich_text)
        end
        table.insert(res, {code = "}"})

      elseif token.type == "math" then
        table.insert(res, {["math-tex"] = token.text})
      end

    end
    i = i + 1
  end

  if tmp_str ~= "" then
    table.insert(res, tmp_str)
  end

  -- Merge tokens
  for i = #res - 1, 1, -1 do
    local token = res[i]
    local next_token = res[i + 1]
    local token_type = type(token)
    local next_token_type = type(next_token)
    if token_type == "string" and next_token_type == "string" then
      res[i] = token .. next_token
      table.remove(res, i + 1)
    elseif token_type == "table" and token.code and
        next_token_type == "table" and next_token.code then
      token.code = token.code .. next_token.code
      table.remove(res, i + 1)
    end
  end

  if #res == 1 and type(res[1]) == "string" then
    res = res[1]
  end

  return res
end


function latex_parser.parse_seq(str)
  local P = lpeg.P
  local S = lpeg.S
  local C = lpeg.C
  local Ct = lpeg.Ct
  local V = lpeg.V

  ---@diagnostic disable: codestyle-check
  local balanced = P{"{" * V(1) ^ 0 * "}" + (P"\\{" + P"\\}" + 1 - S"{}")}
  local item = P"{" * C(balanced^0) * P"}" + C((balanced - P",")^1)
  local seq = Ct((item * P(",")^-1)^0)
  ---@diagnostic enable: codestyle-check

  return seq:match(str)
end

function latex_parser.parse_prop(str)
  local P = lpeg.P
  local R = lpeg.R
  local S = lpeg.S
  local C = lpeg.C
  local Cf = lpeg.Cf
  local Cg = lpeg.Cg
  local Ct = lpeg.Ct
  local V = lpeg.V

  ---@diagnostic disable: codestyle-check
  local balanced = P{"{" * V(1)^0 * "}" + (P"\\{" + P"\\}" + 1 - S"{}")}
  local key = (R"09" + R"AZ" + R"az" + S"-_./")^1
  local value = (P"{" * C(balanced^0) * P"}" + C((balanced - S",=")^0)) / function (s)
    if s == "true" then
      return true
    elseif s == "false" then
      return false
    else
      return s
    end
  end
  local space = S(" \t\r\n")^0
  local pair = C(key) * space * P"=" * space * value * (P(",") * space)^-1
  local prop = Cf(Ct"" * space * Cg(pair)^0, rawset)
  ---@diagnostic enable: codestyle-check

  -- if not str then
  --   print(debug.traceback())
  -- end
  return prop:match(str)
end


return latex_parser