package edu.stanford.nlp.international.spanish.pipeline;

import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.nndep.Config;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.AddNode;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Interval;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/international/spanish/pipeline/MultiWordPreprocessor.class */
public final class MultiWordPreprocessor {
    /** Number of pre-terminals seen by {@link #traverseAndFix} that carried the multi-word dummy POS tag. */
    private static int nMissingPOS;
    /** Number of constituents seen by {@link #traverseAndFix} whose label started with the multi-word phrase tag. */
    private static int nMissingPhrasal;
    /** How many of the {@link #nMissingPOS} nodes received an inferred tag. */
    private static int nFixedPOS;
    /** How many of the {@link #nMissingPhrasal} nodes received an inferred phrasal category. */
    private static int nFixedPhrasal;
    /** Used by {@code inferPOS} to split a word into a stem plus attached pronouns; assigned in the static initializer. */
    private static final SpanishVerbStripper verbStripper;
    /**
     * Command-line option definitions handed to {@code StringUtils.argsToProperties} in {@link #main};
     * populated in the static initializer (the integer is presumably the option's argument count -- confirm).
     */
    private static final Map<String, Integer> argOptionDefs;
    /** Logging channel for this class. */
    private static Redwood.RedwoodChannels log = Redwood.channels(MultiWordPreprocessor.class);
    /** Maps the POS suffix of a multi-word phrase label to a phrasal category; populated in the static initializer. */
    private static final Map<String, String> phrasalCategoryMap = new HashMap();

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/stanford/nlp/international/spanish/pipeline/MultiWordPreprocessor$ManualUWModel.class */
    public static class ManualUWModel {
        /** Hand-written word -> POS tag table consulted by {@link #getTag}; filled in the static initializer. */
        private static final Map<String, String> posMap = new HashMap();
        /** Size of {@link #posMap} once the static initializer has filled it; reported by the enclosing class's main. */
        private static int nUnknownWordTypes;
        /** Finds a run of digits anywhere in a word. */
        private static final Pattern digit;
        /** Finds the endings -ado/-ada/-ido/-ida at the end of a word. */
        private static final Pattern participle;
        /** Words that {@link #getOverrideTag} always tags "np00000". */
        private static final Set<String> actuallyNames;
        /** Searched in the phrase containing "Al" to decide between "np00000" and "sp000". */
        private static final Pattern otherNamePattern;
        /** Searched in the phrase containing "A" as one of the triggers for "np00000". */
        private static final Pattern otherNamePattern2;
        /** Fully matches todo/otro/uno and their feminine and plural forms. */
        private static final Pattern pPronounDeterminers;
        /** Case-insensitive pattern used as the last fallback in {@link #getTag}. */
        private static final Pattern commonPattern;

        /** Private constructor: this class only exposes static members and is not meant to be instantiated. */
        private ManualUWModel() {
        }

        /**
         * Returns a hand-specified POS tag for a word in the context of its containing
         * phrase, or {@code null} when no manual rule applies.
         *
         * <p>The rules are evaluated top to bottom and the first match wins, so their
         * order is significant (for example the specific "contra" / "medio" rules must
         * run before the more general ones).
         *
         * @param str  the word to tag; must not be {@code null}
         * @param str2 the space-joined phrase that contains the word, or {@code null}
         *             when the word has no containing phrase
         * @return the overriding tag, or {@code null} if the caller should fall back
         *         to its other tagging strategies
         */
        public static String getOverrideTag(String str, String str2) {
            if (str2 == null) {
                return null;
            }
            final String word = str;
            final String phrase = str2;

            if (word.equalsIgnoreCase("este") && !phrase.startsWith(word)) {
                return "np00000";
            }
            if (word.equals("contra") && (phrase.startsWith("en contra") || phrase.startsWith("En contra"))) {
                return "nc0s000";
            }
            if (word.equals("total") && phrase.startsWith("ese")) {
                return "nc0s000";
            }
            if (word.equals("DEL")) {
                return "sp000";
            }
            // Only the word "sí" itself takes this tag. The previous condition was
            // (A && B) || C, which also tagged "fuera" and "de" of "fuera de sí".
            if (word.equals("sí") && (phrase.contains("por sí") || phrase.contains("fuera de sí"))) {
                return "pp000000";
            }
            if (pPronounDeterminers.matcher(word).matches() && phrase.endsWith(word)) {
                return "pi000000";
            }
            if (word.equals("cuando") && phrase.endsWith(word)) {
                return "pi000000";
            }
            if (word.equalsIgnoreCase("contra") && phrase.endsWith(word)) {
                return "nc0s000";
            }
            if (word.equals("salvo") && phrase.endsWith("salvo")) {
                return "aq0000";
            }
            if (word.equals("mira") && phrase.endsWith(word)) {
                return "nc0s000";
            }
            if (word.equals("pro") && phrase.startsWith("en pro")) {
                return "nc0s000";
            }
            if (word.equals("espera") && phrase.endsWith("espera de")) {
                return "nc0s000";
            }
            if (word.equals("Paso") && phrase.equals("El Paso")) {
                return "np00000";
            }
            if (word.equals("medio")
                    && (phrase.endsWith("medio de") || phrase.endsWith("ambiente") || phrase.endsWith("por medio")
                        || phrase.contains("por medio") || phrase.endsWith("medio"))) {
                return "nc0s000";
            }
            if (word.equals("Medio") && phrase.contains("Ambiente")) {
                return "nc0s000";
            }
            if (word.equals("Medio") && phrase.equals("Oriente Medio")) {
                return "aq0000";
            }
            if (word.equals("media") && phrase.equals("mass media")) {
                return "nc0n000";
            }
            if (word.equals("cuenta")) {
                return "nc0s000";
            }
            if (word.equals("h") && phrase.startsWith("km")) {
                return "zu";
            }
            if (word.equals("A")
                    && (phrase.contains("-") || phrase.contains(",") || otherNamePattern2.matcher(phrase).find()
                        || phrase.equals("terminal A"))) {
                return "np00000";
            }
            if (word.equals("forma") && phrase.startsWith("forma parte")) {
                return "vmip000";
            }
            if (word.equals("Sin") && phrase.contains("Jaime")) {
                return "np00000";
            }
            if (word.equals("di") && phrase.contains("di cuenta")) {
                return "vmis000";
            }
            if (word.equals("demos") && phrase.contains("demos cuenta")) {
                return "vmsp000";
            }
            if ((word.equals("van") || word.equals("den")) && phrase.contains("van den")) {
                return "np00000";
            }
            if (word.equals("Al")) {
                // "Al" next to another capitalised token is treated as part of a name.
                return otherNamePattern.matcher(phrase).find() ? "np00000" : "sp000";
            }
            if (actuallyNames.contains(word)) {
                return "np00000";
            }
            if ((word.equals("sino") && phrase.endsWith(word)) || word.equals("mañana") || word.equals("paso")
                    || word.equals("monta") || word.equals("deriva") || word.equals("visto")) {
                return "nc0s000";
            }
            if (word.equals("frente") && phrase.startsWith("al frente")) {
                return "nc0s000";
            }
            return null;
        }

        /**
         * Picks a fallback POS tag for a word that no other strategy could tag.
         *
         * <p>Checks run in a fixed order: exact symbol matches, digits, the manual
         * {@code posMap} table, a participle-like ending, {@code commonPattern}, and
         * finally the default "np00000".
         *
         * <p>NOTE(review): {@code str2} is never read, and {@code commonPattern}
         * contains spaces and anchors yet is applied with {@code matches()} to the
         * single word. It looks as if it was meant to be searched in the containing
         * phrase -- confirm before changing, since that would alter the tags produced.
         *
         * @param str  the word to tag; must not be {@code null}
         * @param str2 the phrase containing the word (currently unused)
         * @return a POS tag, never {@code null}
         */
        public static String getTag(String str, String str2) {
            if (str.equals("%")) {
                return "ft";
            }
            if (str.equals(ATBTreeUtils.morphBoundary)) {
                return "fz";
            }
            if (str.equals("&") || str.equals("@")) {
                return "f0";
            }
            if (digit.matcher(str).find()) {
                return "z0";
            }
            if (posMap.containsKey(str)) {
                return posMap.get(str);
            }
            if (participle.matcher(str).find()) {
                return "aq0000";
            }
            if (commonPattern.matcher(str).matches()) {
                return "ncms000";
            }
            return "np00000";
        }

        static {
            posMap.put("cúbico", "aq0000");
            posMap.put("cúbicos", "aq0000");
            posMap.put("diagonal", "aq0000");
            posMap.put("diestro", "aq0000");
            posMap.put("llevados", "aq0000");
            posMap.put("llevadas", "aq0000");
            posMap.put("menudo", "aq0000");
            posMap.put("obstante", "aq0000");
            posMap.put("rapadas", "aq0000");
            posMap.put("rasa", "aq0000");
            posMap.put("súbito", "aq0000");
            posMap.put("temática", "aq0000");
            posMap.put("tuya", "px000000");
            posMap.put("alter", "nc0s000");
            posMap.put("ego", "nc0s000");
            posMap.put("Jet", "nc0s000");
            posMap.put("lag", "nc0s000");
            posMap.put("line", "nc0s000");
            posMap.put("lord", "nc0s000");
            posMap.put("model", "nc0s000");
            posMap.put("mortem", "nc0s000");
            posMap.put("pater", "nc0s000");
            posMap.put("pipe", "nc0s000");
            posMap.put("play", "nc0s000");
            posMap.put("pollastre", "nc0s000");
            posMap.put("post", "nc0s000");
            posMap.put("power", "nc0s000");
            posMap.put("priori", "nc0s000");
            posMap.put("rock", "nc0s000");
            posMap.put("roll", "nc0s000");
            posMap.put("salubritatis", "nc0s000");
            posMap.put("savoir", "nc0s000");
            posMap.put("service", "nc0s000");
            posMap.put("status", "nc0s000");
            posMap.put("stem", "nc0s000");
            posMap.put("street", "nc0s000");
            posMap.put("task", "nc0s000");
            posMap.put("trio", "nc0s000");
            posMap.put("zigzag", "nc0s000");
            posMap.put("mass", "nc0n000");
            posMap.put("media", "nc0n000");
            posMap.put("options", "nc0p000");
            posMap.put("regañadientes", "nc0n000");
            posMap.put("sabiendas", "nc0n000");
            posMap.put("virgen", "nc0s000");
            posMap.put("merced", "ncfs000");
            posMap.put("miel", "ncfs000");
            posMap.put("torera", "ncfs000");
            posMap.put("ultranza", "ncfs000");
            posMap.put("vísperas", "ncfs000");
            posMap.put("acecho", "ncms000");
            posMap.put("alzamiento", "ncms000");
            posMap.put("bordo", "ncms000");
            posMap.put("cápita", "ncms000");
            posMap.put("ciento", "ncms000");
            posMap.put("cuño", "ncms000");
            posMap.put("pairo", "ncms000");
            posMap.put("pese", "ncms000");
            posMap.put("pique", "ncms000");
            posMap.put(Annotator.STANFORD_POS, "ncms000");
            posMap.put("postre", "ncms000");
            posMap.put("pro", "ncms000");
            posMap.put("ralentí", "ncms000");
            posMap.put("ras", "ncms000");
            posMap.put("rebato", "ncms000");
            posMap.put("torno", "ncms000");
            posMap.put("través", "ncms000");
            posMap.put("creces", "ncfp000");
            posMap.put("cuestas", "ncfp000");
            posMap.put("oídas", "ncfp000");
            posMap.put("tientas", "ncfp000");
            posMap.put("trizas", "ncfp000");
            posMap.put("veras", "ncfp000");
            posMap.put("abuelos", "ncmp000");
            posMap.put("ambages", "ncmp000");
            posMap.put("modos", "ncmp000");
            posMap.put("pedazos", "ncmp000");
            posMap.put("A", "sps00");
            posMap.put("amén", "rg");
            posMap.put("Bailando", "vmg0000");
            posMap.put("Soñando", "vmg0000");
            posMap.put("Teniendo", "vmg0000");
            posMap.put("echaremos", "vmif000");
            posMap.put("formaba", "vmii000");
            posMap.put("Formabas", "vmii000");
            posMap.put("Forman", "vmip000");
            posMap.put("perece", "vmip000");
            posMap.put("PONE", "vmip000");
            posMap.put("suicídate", "vmm0000");
            posMap.put("tardar", "vmn0000");
            posMap.put("seiscientas", "z0");
            posMap.put("trescientas", "z0");
            posMap.put("cc", "zu");
            posMap.put("km", "zu");
            posMap.put("kms", "zu");
            nUnknownWordTypes = posMap.size();
            digit = Pattern.compile("\\d+");
            participle = Pattern.compile("[ai]d[oa]$");
            actuallyNames = new HashSet(Arrays.asList("Avenida", "Contra", "Gracias", "in", "Mercado", "Jesús", "Salvo", "Van"));
            otherNamePattern = Pattern.compile("\\b(Al\\w+|A[^l]\\w*|[B-Z]\\w+)");
            otherNamePattern2 = Pattern.compile("\\b(A\\w+|[B-Z]\\w+)");
            pPronounDeterminers = Pattern.compile("(tod|otr|un)[oa]s?");
            commonPattern = Pattern.compile("^al? |^en .+ de$|sin | al?$| que$", 2);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/stanford/nlp/international/spanish/pipeline/MultiWordPreprocessor$POSTieBreaker.class */
    public static class POSTieBreaker implements Comparator<String> {
        private POSTieBreaker() {
        }

        /**
         * Orders tags so that those starting with "n" come before all others; two
         * tags that both do, or both do not, start with "n" compare as equal.
         *
         * @param str  first tag
         * @param str2 second tag
         * @return -1 if only the first starts with "n", 1 if only the second does, otherwise 0
         */
        @Override // java.util.Comparator
        public int compare(String str, String str2) {
            boolean firstIsN = str.startsWith("n");
            boolean secondIsN = str2.startsWith("n");
            if (firstIsN == secondIsN) {
                return 0;
            }
            return firstIsN ? -1 : 1;
        }
    }

    /**
     * Adds the (word, tag) pairs of a tree's tagged yield to the counter, skipping
     * tokens that still carry the multi-word dummy tag.
     *
     * @param twoDimensionalCounter counter indexed by word, then tag; incremented in place
     * @param tree                  the tree whose tagged yield is counted
     */
    public static void updateTagger(TwoDimensionalCounter<String, String> twoDimensionalCounter, Tree tree) {
        for (CoreLabel token : tree.taggedLabeledYield()) {
            if (token.tag().equals(SpanishTreeNormalizer.MW_TAG)) {
                continue;
            }
            twoDimensionalCounter.incrementCount(token.word(), token.tag());
        }
    }

    /**
     * Walks a tree depth-first and replaces multi-word dummy labels in place.
     *
     * <p>Pre-terminals labelled with the dummy POS tag get a tag from
     * {@code inferPOS}; after all children have been processed, constituents whose
     * label starts with the multi-word phrase tag get a category from
     * {@code inferPhrasalCategory}. The missing/fixed counters of this class are
     * updated along the way.
     *
     * @param tree                  the node to process
     * @param tree2                 the parent of {@code tree}, or {@code null} at the root
     * @param twoDimensionalCounter word/tag counts used for POS inference
     * @param z                     forwarded to {@code inferPhrasalCategory}
     */
    public static void traverseAndFix(Tree tree, Tree tree2, TwoDimensionalCounter<String, String> twoDimensionalCounter, boolean z) {
        if (tree.isPreTerminal()) {
            if (!tree.value().equals(SpanishTreeNormalizer.MW_TAG)) {
                return;
            }
            nMissingPOS++;
            String inferredTag = inferPOS(tree, tree2, twoDimensionalCounter);
            if (inferredTag != null) {
                tree.setValue(inferredTag);
                nFixedPOS++;
            }
            return;
        }

        // Children first, so that phrase labels are resolved bottom-up.
        for (Tree child : tree.children()) {
            traverseAndFix(child, tree, twoDimensionalCounter, z);
        }

        if (!tree.value().startsWith(SpanishTreeNormalizer.MW_PHRASE_TAG)) {
            return;
        }
        nMissingPhrasal++;
        String inferredCategory = inferPhrasalCategory(tree, z);
        if (inferredCategory != null) {
            tree.setValue(inferredCategory);
            nFixedPhrasal++;
        }
    }

    /**
     * Builds the phrase that surrounds a node by joining the values of its parent's
     * yield with {@code AddNode.ATOM_DELIMITER}.
     *
     * <p>The delimiter is placed only between values, so an empty yield gives an
     * empty string instead of an exception, and no assumption is made about the
     * delimiter's length.
     *
     * @param tree  the node itself (not used; kept for the existing call sites)
     * @param tree2 the parent whose yield forms the phrase, may be {@code null}
     * @return the joined phrase, or {@code null} when {@code tree2} is {@code null}
     */
    private static String getContainingPhrase(Tree tree, Tree tree2) {
        if (tree2 == null) {
            return null;
        }
        StringBuilder phrase = new StringBuilder();
        boolean first = true;
        for (Label label : tree2.yield()) {
            if (!first) {
                phrase.append(AddNode.ATOM_DELIMITER);
            }
            phrase.append(label.value());
            first = false;
        }
        return phrase.toString();
    }

    /**
     * Infers a POS tag for a pre-terminal that carries the multi-word dummy tag.
     *
     * <p>Strategies, in order:
     * <ol>
     *   <li>a manual override from {@code ManualUWModel.getOverrideTag};</li>
     *   <li>if the verb stripper yields a stem that the counter knows and whose most
     *       frequent tag starts with "v", that tag;</li>
     *   <li>the most frequent tag of the word itself, ties broken by
     *       {@code POSTieBreaker};</li>
     *   <li>the fallback from {@code ManualUWModel.getTag}.</li>
     * </ol>
     *
     * @param tree                  the pre-terminal whose first child holds the word
     * @param tree2                 the parent of {@code tree}, or {@code null}
     * @param twoDimensionalCounter word/tag counts
     * @return the inferred tag
     */
    private static String inferPOS(Tree tree, Tree tree2, TwoDimensionalCounter<String, String> twoDimensionalCounter) {
        String word = tree.firstChild().value();
        String containingPhrase = getContainingPhrase(tree, tree2);

        String overrideTag = ManualUWModel.getOverrideTag(word, containingPhrase);
        if (overrideTag != null) {
            return overrideTag;
        }

        Set<String> knownWords = twoDimensionalCounter.firstKeySet();

        // The counter is keyed by plain strings; the former casts of a String to
        // TwoDimensionalCounter were decompiler artifacts that cannot compile.
        SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.separatePronouns(word);
        if (strippedVerb != null && knownWords.contains(strippedVerb.getStem())) {
            String stemTag = Counters.argmax(twoDimensionalCounter.getCounter(strippedVerb.getStem()));
            if (stemTag != null && stemTag.startsWith("v")) {
                return stemTag;
            }
        }

        if (knownWords.contains(word)) {
            return Counters.argmax(twoDimensionalCounter.getCounter(word), new POSTieBreaker());
        }
        return ManualUWModel.getTag(word, containingPhrase);
    }

    /**
     * Derives a phrasal category for a multi-word phrase node from the POS suffix
     * of its label (the text after the last underscore).
     *
     * <p>The suffix is first looked up in {@code phrasalCategoryMap}. Suffixes
     * starting with 'n' map to "grup.nom"; when {@code z} is set, the last
     * character of the label refines that to one of the "grup.nom.*" categories.
     * If nothing matches, the failure is logged and {@code null} is returned.
     *
     * @param tree the phrase node whose label is inspected
     * @param z    whether to return the refined "grup.nom.*" categories
     * @return the phrasal category, or {@code null} if none could be inferred
     */
    private static String inferPhrasalCategory(Tree tree, boolean z) {
        String label = tree.value();
        String posSuffix = label.substring(label.lastIndexOf('_') + 1);

        if (phrasalCategoryMap.containsKey(posSuffix)) {
            return phrasalCategoryMap.get(posSuffix);
        }

        if (posSuffix.length() > 0 && posSuffix.charAt(0) == 'n') {
            if (!z) {
                return "grup.nom";
            }
            // Plain char literals: the labels used to be tied to unrelated int
            // constants that merely happened to equal 48 ('0') and 112 ('p').
            switch (label.charAt(label.length() - 1)) {
                case '0':
                    return "grup.nom.otros";
                case 'l':
                    return "grup.nom.lug";
                case 'o':
                    return "grup.nom.org";
                case 'p':
                    return "grup.nom.pers";
                default:
                    return "grup.nom";
            }
        }

        // Nothing matched: log the children's labels to help extend the mapping.
        StringBuilder childLabels = new StringBuilder();
        for (Tree child : tree.children()) {
            childLabels.append(child.value()).append(AddNode.ATOM_DELIMITER);
        }
        log.info("No phrasal cat for: " + childLabels.toString().trim() + " (original POS of MWE: " + posSuffix + ")");
        return null;
    }

    /**
     * Reads every tree from a treebank file, resolves its multi-word dummy tags,
     * expands multi-word phrases, optionally normalizes the result, and writes one
     * tree per line to a sibling file whose name is the input name plus ".fixed".
     *
     * <p>Both files are handled as UTF-8. The reader and writer are closed even when
     * processing fails part-way, and I/O failures are reported on standard error
     * rather than propagated.
     *
     * @param file                  the treebank file to read
     * @param twoDimensionalCounter word/tag counts used for POS inference
     * @param z                     forwarded to {@link #traverseAndFix}
     * @param treeNormalizer        normalizer applied to each expanded tree, or
     *                              {@code null} to skip normalization
     */
    private static void resolveDummyTags(File file, TwoDimensionalCounter<String, String> twoDimensionalCounter, boolean z, TreeNormalizer treeNormalizer) {
        LabeledScoredTreeFactory treeFactory = new LabeledScoredTreeFactory();
        MultiWordTreeExpander expander = new MultiWordTreeExpander();
        File outputFile = new File(file + ".fixed");
        TreeReader reader = null;
        PrintWriter writer = null;
        try {
            reader = new SpanishTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")));
            writer = new PrintWriter(new PrintStream((OutputStream) new FileOutputStream(outputFile), false, "UTF-8"));

            int nTrees = 0;
            for (Tree tree = reader.readTree(); tree != null; tree = reader.readTree()) {
                traverseAndFix(tree, null, twoDimensionalCounter, z);
                Tree expanded = expander.expandPhrases(tree, treeNormalizer, treeFactory);
                if (treeNormalizer != null) {
                    expanded = treeNormalizer.normalizeWholeTree(expanded, treeFactory);
                }
                writer.println(expanded.toString());
                nTrees++;
            }

            // PrintWriter never throws on write failures; ask it explicitly.
            if (writer.checkError()) {
                System.err.println("Error while writing " + outputFile);
            }
            System.out.println("Processed " + nTrees + " trees");
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            e.printStackTrace();
        } finally {
            if (writer != null) {
                writer.close();
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds the command-line help text.
     *
     * @return the usage message, one option per line
     */
    private static String usage() {
        final String nl = System.getProperty("line.separator");
        return String.format("Usage: java %s [OPTIONS] treebank-file%n", MultiWordPreprocessor.class.getName())
                + "Options:" + nl
                + "   -help: Print this message" + nl
                + "   -ner: Retain NER information in tree constituents (pre-pre-terminal nodes)" + nl
                + "   -normalize {true, false}: Run the Spanish tree normalizer (non-aggressive) on the output of the main routine (true by default)" + nl;
    }

    /**
     * Command-line entry point.
     *
     * <p>Makes a first pass over the treebank file to collect word/tag counts, then
     * calls {@code resolveDummyTags} to write the fixed trees, and finally prints
     * summary statistics. Prints the usage text and returns when no treebank file
     * is given or {@code -help} is present.
     *
     * @param strArr command-line arguments, see {@code usage()}
     */
    public static void main(String[] strArr) {
        Properties options = StringUtils.argsToProperties(strArr, argOptionDefs);
        if (!options.containsKey("") || options.containsKey("help")) {
            log.info(usage());
            return;
        }
        boolean retainNER = PropertiesUtils.getBool(options, Annotator.STANFORD_NER, false);
        boolean normalize = PropertiesUtils.getBool(options, "normalize", true);
        File treebankFile = new File(options.getProperty(""));
        TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<String, String>();
        try {
            TreeReader reader = new SpanishTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treebankFile), "UTF-8")));
            try {
                for (Tree tree = reader.readTree(); tree != null; tree = reader.readTree()) {
                    updateTagger(unigramTagger, tree);
                }
            } finally {
                // Release the file even if a tree fails to parse.
                reader.close();
            }
            System.out.println("Resolving DUMMY tags");
            resolveDummyTags(treebankFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
            System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
            System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, percentOf(nFixedPOS, nMissingPOS)));
            System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, percentOf(nFixedPhrasal, nMissingPhrasal)));
            System.out.println("Done!");
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            e.printStackTrace();
        }
    }

    /**
     * Computes {@code part} as a percentage of {@code total} in floating point.
     *
     * @param part  the numerator
     * @param total the denominator
     * @return the percentage, or 0 when {@code total} is 0
     */
    private static double percentOf(int part, int total) {
        if (total == 0) {
            return 0.0d;
        }
        // Multiply in double first; int / int would truncate to 0 or 1.
        return (100.0d * part) / total;
    }

    static {
        phrasalCategoryMap.put("ao0000", "grup.a");
        phrasalCategoryMap.put("aq0000", "grup.a");
        phrasalCategoryMap.put("aqo000", "grup.a");
        phrasalCategoryMap.put("da0000", "spec");
        phrasalCategoryMap.put("di0000", "sn");
        phrasalCategoryMap.put("dn0000", "spec");
        phrasalCategoryMap.put("dt0000", "spec");
        phrasalCategoryMap.put("i", "interjeccio");
        phrasalCategoryMap.put("i00", "interjeccio");
        phrasalCategoryMap.put("rg", "grup.adv");
        phrasalCategoryMap.put("rn", "grup.adv");
        phrasalCategoryMap.put("vaip000", "grup.verb");
        phrasalCategoryMap.put("vmg0000", "grup.verb");
        phrasalCategoryMap.put("vmic000", "grup.verb");
        phrasalCategoryMap.put("vmii000", "grup.verb");
        phrasalCategoryMap.put("vmif000", "grup.verb");
        phrasalCategoryMap.put("vmip000", "grup.verb");
        phrasalCategoryMap.put("vmis000", "grup.verb");
        phrasalCategoryMap.put("vmm0000", "grup.verb");
        phrasalCategoryMap.put("vmn0000", "grup.verb");
        phrasalCategoryMap.put("vmp0000", "grup.verb");
        phrasalCategoryMap.put("vmsi000", "grup.verb");
        phrasalCategoryMap.put("vmsp000", "grup.verb");
        phrasalCategoryMap.put("zm", "grup.nom");
        phrasalCategoryMap.put("cc", "grup.cc");
        phrasalCategoryMap.put("cs", "grup.cs");
        phrasalCategoryMap.put("pn000000", "grup.nom");
        phrasalCategoryMap.put("pi000000", "grup.pron");
        phrasalCategoryMap.put("pr000000", "grup.pron");
        phrasalCategoryMap.put("pt000000", "grup.pron");
        phrasalCategoryMap.put("px000000", "grup.pron");
        phrasalCategoryMap.put("sp000", "grup.prep");
        phrasalCategoryMap.put("w", "grup.w");
        phrasalCategoryMap.put("z", "grup.z");
        phrasalCategoryMap.put("z0", "grup.z");
        phrasalCategoryMap.put("zp", "grup.z");
        phrasalCategoryMap.put("zu", "grup.z");
        verbStripper = SpanishVerbStripper.getInstance();
        argOptionDefs = Generics.newHashMap();
        argOptionDefs.put("help", 0);
        argOptionDefs.put(Annotator.STANFORD_NER, 0);
        argOptionDefs.put("normalize", 1);
    }
}
