/** * Authors: Frederik Leyvraz, David Degenhardt * License: GNU General Public License v3.0 only * Version: 1.0.0 */ package ch.bfh.ti.latexindexer; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class PandocParser implements Parser { private String latexFilePath; private final String PANDOC = "pandoc"; private final String PANDOC_OPTIONS = "--to=plain"; // private final String PANDOC_LUA_FILTER = "src/main/resources/frequency-filter.lua"; private final String LEADING_TRAILING_SYMBOLS = "^[^a-zA-Z]+|[^a-zA-Z]+$"; /** * Constructor * @param latexFile The path to the latex file that should be parsed. */ public PandocParser(String latexFile) { this.latexFilePath = latexFile; } /** * Checks the version of the parser * @throws IOException If the parser is not present on the system. */ public void checkVersion() throws IOException { String output; Process pandoc = new ProcessBuilder(PANDOC, "-v").start(); BufferedReader input = new BufferedReader(new InputStreamReader(pandoc.getInputStream())); while ((output = input.readLine()) != null) { System.err.println(output); } input.close(); } @Override public List parseDocument() throws IOException { Map words = new HashMap<>(); checkVersion(); File latexFile = new File(latexFilePath); ProcessBuilder processBuilder = new ProcessBuilder(PANDOC, PANDOC_OPTIONS, latexFile.getAbsolutePath()); processBuilder.directory(latexFile.getParentFile()); try { Process process = processBuilder.start(); BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream())); String line; while ((line = reader.readLine()) != null) { String[] wordsOfLine = line.split("\\W+"); for (String word : wordsOfLine) { word = word.replaceAll(LEADING_TRAILING_SYMBOLS, ""); if (!word.isEmpty()) { words.computeIfPresent(word, (k, v) -> v + 1); words.putIfAbsent(word, 1); } } } BufferedReader error = new BufferedReader(new InputStreamReader(process.getErrorStream())); while ((line = error.readLine()) != null) { System.err.println(line); } reader.close(); int exitCode = process.waitFor(); if (exitCode != 0) { System.err.println("[ERROR] Pandoc exited with code: " + exitCode); } } catch (InterruptedException e) { throw new IOException("Error running Pandoc", e); } List result = new ArrayList<>(); for (String word : words.keySet()) { result.add(new Word(word, words.get(word))); } return result; } }