/**
 * Authors: Frederik Leyvraz, David Degenhardt
 * License: GNU General Public License v3.0 only
 * Version: 1.0.0
 */
package ch.bfh.ti.latexindexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * {@link Parser} implementation that extracts the words of a LaTeX document by
 * running the external {@code detex} command and reading its output line by line.
 */
public class DetexParser implements Parser {

    /** Name of the external executable that is launched. */
    private static final String DETEX = "detex";

    /** Option passed to detex when parsing (presumably word-list output; see the detex manual). */
    private static final String DETEX_OPTIONS = "-w";

    /**
     * Matches a run of non-ASCII-letter characters at the very start or the very
     * end of a string. Compiled once because it is applied to every output line.
     * NOTE(review): only a-z/A-Z count as letters here, so accented letters at the
     * edges of a word are stripped as well - confirm this is intended.
     */
    private static final Pattern LEADING_TRAILING_SYMBOLS =
            Pattern.compile("^[^a-zA-Z]+|[^a-zA-Z]+$");

    /** Path of the LaTeX file handed to detex. */
    private final String latexFilePath;

    /**
     * Constructor
     * @param latexFile The path to the latex file that should be parsed.
     */
    public DetexParser(String latexFile) {
        this.latexFilePath = latexFile;
    }

    /**
     * Checks the version of the parser by running {@code detex -v} and echoing
     * everything it writes to standard output onto {@code System.err}.
     * @throws IOException If the parser is not present on the system or its
     *                     output cannot be read.
     */
    public void checkVersion() throws IOException {
        Process detex = new ProcessBuilder(DETEX, "-v").start();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader input =
                     new BufferedReader(new InputStreamReader(detex.getInputStream()))) {
            String output;
            while ((output = input.readLine()) != null) {
                System.err.println(output);
            }
        }
    }

    /**
     * Runs detex on the configured LaTeX file and collects the distinct words of
     * its output.
     * <p>
     * Each output line is trimmed, stripped of leading and trailing characters
     * that are not ASCII letters, and wrapped in a {@link Word}. Lines that are
     * empty after stripping are skipped. Duplicates (according to
     * {@code List.contains}, i.e. {@code Word.equals}) are dropped and the result
     * is sorted with {@code Word.AlphabeticalComparator}.
     *
     * @return the distinct words, sorted by {@code Word.AlphabeticalComparator}
     * @throws IOException If detex cannot be started or its output cannot be read.
     */
    @Override
    public List<Word> parseDocument() throws IOException {
        checkVersion();

        File latexFile = new File(latexFilePath);
        ProcessBuilder processBuilder =
                new ProcessBuilder(DETEX, DETEX_OPTIONS, latexFile.getAbsolutePath());
        // Run detex from the document's own directory.
        processBuilder.directory(latexFile.getParentFile());
        // NOTE(review): stderr is merged into stdout, so any diagnostics detex
        // prints are read as input lines below - confirm this is acceptable.
        processBuilder.redirectErrorStream(true);
        Process process = processBuilder.start();

        List<Word> words = new ArrayList<>();
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String cleaned = LEADING_TRAILING_SYMBOLS.matcher(line.trim()).replaceAll("");
                if (cleaned.isEmpty()) {
                    // Blank lines and tokens without any letter would otherwise
                    // end up in the result as an empty word.
                    continue;
                }
                Word word = new Word(cleaned);
                if (!words.contains(word)) {
                    words.add(word);
                }
            }
        }

        words.sort(new Word.AlphabeticalComparator());
        return words;
    }
}