package edu.stanford.nlp.international.spanish.process;

import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.tokensregex.SequenceMatchRules;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.AddNode;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/international/spanish/process/SpanishTokenizer.class */
public class SpanishTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
    /** Lexer built in the constructor; {@link #getNext()} pulls raw tokens from it. */
    private final SpanishLexer lexer;
    /** When true, tokens whose ParentAnnotation equals "comp" are passed to processCompound. */
    private final boolean splitCompounds;
    /** When true, tokens whose ParentAnnotation equals SpanishLexer.VB_PRON_ANNOTATION are passed to processVerb. */
    private final boolean splitVerbs;
    /** When true, tokens whose ParentAnnotation equals "contraction" are passed to processContraction. */
    private final boolean splitContractions;
    /** Logical OR of the three split flags; computed once in the constructor. */
    private final boolean splitAny;
    /** Pieces produced by a split that have not been returned yet; only allocated when {@link #splitAny} is true. */
    private List<CoreLabel> compoundBuffer;
    /** Obtained from SpanishVerbStripper.getInstance() in the constructor, and only when verb splitting is enabled. */
    private SpanishVerbStripper verbStripper;
    /** Option string applied by {@link #ancoraFactory()}. */
    public static final String ANCORA_OPTIONS = "ellipses=ascii,normalizeParentheses=true,splitAll=true";
    /** Option string used by the factory methods that do not take explicit options. */
    public static final String DEFAULT_OPTIONS = "invertible,ellipses=ascii,splitAll=false";
    /** Logging channel for this class; main() uses it to print the usage text. */
    private static final Redwood.RedwoodChannels log = Redwood.channels(SpanishTokenizer.class);
    /** Matches a single hyphen; processCompound pads each match with spaces. */
    private static final Pattern pDash = Pattern.compile("-");
    /** Matches a run of whitespace; processCompound splits the padded word on it. */
    private static final Pattern pSpace = Pattern.compile("\\s+");

    /* loaded from: input_file:edu/stanford/nlp/international/spanish/process/SpanishTokenizer$SpanishTokenizerFactory.class */
    /**
     * Factory for {@link SpanishTokenizer} instances.
     *
     * <p>The factory keeps the lexer properties and the three split switches that
     * are handed to every tokenizer it creates. Both are populated from a
     * comma-separated option string by {@link #setOptions(String)}.
     *
     * @param <T> token type produced by the tokenizers
     */
    public static class SpanishTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {
        private static final long serialVersionUID = 946818805507187330L;

        /** Token factory forwarded to every tokenizer (and from there to the lexer). */
        protected final LexedTokenFactory<T> factory;
        /** Options that are not split switches; forwarded verbatim to the lexer. */
        protected Properties lexerProperties;
        /** Value passed as the compound-splitting flag of new tokenizers. */
        protected boolean splitCompoundOption;
        /** Value passed as the verb-splitting flag of new tokenizers. */
        protected boolean splitVerbOption;
        /** Value passed as the contraction-splitting flag of new tokenizers. */
        protected boolean splitContractionOption;

        /**
         * Creates a factory producing {@link CoreLabel} tokens, configured with
         * {@link SpanishTokenizer#DEFAULT_OPTIONS}.
         *
         * @return a new CoreLabel tokenizer factory
         */
        public static TokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
            return new SpanishTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), SpanishTokenizer.DEFAULT_OPTIONS);
        }

        /**
         * Creates a factory for an arbitrary token type.
         *
         * @param lexedTokenFactory factory used to build the tokens
         * @param str comma-separated option string, see {@link #setOptions(String)}
         * @param <T> token type
         * @return a new tokenizer factory
         */
        public static <T extends HasWord> SpanishTokenizerFactory<T> newSpanishTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory, String str) {
            return new SpanishTokenizerFactory<>(lexedTokenFactory, str);
        }

        /** Creates a factory with no lexer properties and every split switch off. */
        private SpanishTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory) {
            this.lexerProperties = new Properties();
            this.splitCompoundOption = false;
            this.splitVerbOption = false;
            this.splitContractionOption = false;
            this.factory = lexedTokenFactory;
        }

        /** Creates a factory and immediately applies the given option string. */
        private SpanishTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory, String str) {
            this(lexedTokenFactory);
            setOptions(str);
        }

        /**
         * Returns a tokenizer over {@code reader}; same as {@link #getTokenizer(Reader)}.
         */
        @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
        public Iterator<T> getIterator(Reader reader) {
            return getTokenizer(reader);
        }

        /**
         * Builds a tokenizer over {@code reader} using the options currently held by this factory.
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader) {
            return new SpanishTokenizer<T>(reader, this.factory, this.lexerProperties,
                    this.splitCompoundOption, this.splitVerbOption, this.splitContractionOption);
        }

        /**
         * Applies a comma-separated list of options to this factory.
         *
         * <p>Each entry is either {@code key} (treated as {@code key=true}) or
         * {@code key=value}. The key {@code splitAll} sets all three split switches
         * at once; {@code splitCompounds}, {@code splitVerbs} and
         * {@code splitContractions} set one switch each. Any other key is stored in
         * {@link #lexerProperties} and left for the lexer to interpret. Entries with
         * more than one {@code '='} are reported on standard error and skipped.
         *
         * <p>NOTE(review): this method was reconstructed after the decompiler failed
         * on it. The salvaged fragments confirm four special keys plus a default that
         * stores a lexer property, and {@code splitAll} appears in the option
         * constants of the enclosing class; the other three key names should be
         * confirmed against the upstream sources.
         *
         * @param options option string; {@code null} or empty leaves the factory unchanged
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public void setOptions(String options) {
            if (options == null || options.isEmpty()) {
                return;
            }
            for (String option : options.split(",")) {
                if (option.isEmpty()) {
                    // Stray comma: nothing to apply, and an empty key is never meaningful.
                    continue;
                }
                String[] fields = option.split("=");
                if (fields.length == 1) {
                    applyOption(fields[0], "true");
                } else if (fields.length == 2) {
                    applyOption(fields[0], fields[1]);
                } else {
                    System.err.printf("%s: Bad option %s%n", getClass().getName(), option);
                }
            }
        }

        /** Routes one key/value pair to a split switch or, failing that, to the lexer properties. */
        private void applyOption(String key, String value) {
            switch (key) {
                case "splitAll":
                    boolean splitAll = Boolean.parseBoolean(value);
                    this.splitCompoundOption = splitAll;
                    this.splitVerbOption = splitAll;
                    this.splitContractionOption = splitAll;
                    break;
                case "splitCompounds":
                    this.splitCompoundOption = Boolean.parseBoolean(value);
                    break;
                case "splitVerbs":
                    this.splitVerbOption = Boolean.parseBoolean(value);
                    break;
                case "splitContractions":
                    this.splitContractionOption = Boolean.parseBoolean(value);
                    break;
                default:
                    this.lexerProperties.setProperty(key, value);
                    break;
            }
        }

        /**
         * Applies {@code str} to this factory (the change persists for later calls)
         * and then builds a tokenizer over {@code reader}.
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader, String str) {
            setOptions(str);
            return getTokenizer(reader);
        }
    }

    /**
     * Creates a tokenizer that reads from {@code reader}.
     *
     * @param reader source of the text to tokenize
     * @param lexedTokenFactory factory handed to the lexer for building tokens
     * @param properties options handed to the lexer
     * @param z whether to split compound tokens
     * @param z2 whether to split pronouns off verbs
     * @param z3 whether to split contractions
     */
    public SpanishTokenizer(Reader reader, LexedTokenFactory<T> lexedTokenFactory, Properties properties, boolean z, boolean z2, boolean z3) {
        this.splitCompounds = z;
        this.splitVerbs = z2;
        this.splitContractions = z3;
        this.splitAny = z || z2 || z3;
        this.lexer = new SpanishLexer(reader, lexedTokenFactory, properties);

        // The verb stripper is only fetched when verb splitting is requested.
        if (z2) {
            this.verbStripper = SpanishVerbStripper.getInstance();
        }
        // The buffer for split-off pieces is only needed when some splitting is on.
        if (this.splitAny) {
            this.compoundBuffer = Generics.newArrayList(4);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Returns the next non-empty token, or {@code null} when the lexer is exhausted.
     *
     * <p>Pieces left in the split buffer are served before the lexer is asked for
     * more input. A {@link CoreLabel} carrying a ParentAnnotation is handed to the
     * matching split routine when that kind of splitting is enabled; the routine
     * returns the first piece and queues the rest.
     *
     * @throws RuntimeIOException if the lexer fails with an {@link IOException}
     */
    @Override // edu.stanford.nlp.process.AbstractTokenizer
    public T getNext() {
        HasWord token;
        try {
            // Skip tokens whose word is empty; stop on the first real token or on null.
            while (true) {
                boolean servedFromBuffer = this.splitAny && !this.compoundBuffer.isEmpty();
                token = servedFromBuffer ? this.compoundBuffer.remove(0) : (HasWord) this.lexer.next();
                if (token == null || !token.word().isEmpty()) {
                    break;
                }
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }

        if (!this.splitAny || !(token instanceof CoreLabel)) {
            return (T) token;
        }
        CoreLabel label = (CoreLabel) token;
        if (!label.containsKey(CoreAnnotations.ParentAnnotation.class)) {
            return (T) token;
        }

        String parentTag = (String) label.get(CoreAnnotations.ParentAnnotation.class);
        if (this.splitCompounds && parentTag.equals("comp")) {
            token = processCompound(label);
        } else if (this.splitVerbs && parentTag.equals(SpanishLexer.VB_PRON_ANNOTATION)) {
            token = processVerb(label);
        } else if (this.splitContractions && parentTag.equals("contraction")) {
            token = processContraction(label);
        }
        return (T) token;
    }

    /**
     * Copies {@code coreLabel} and overwrites the copy's word, value, original text
     * and character span.
     *
     * @param coreLabel label to copy; it is not modified
     * @param str text for the copy's word, value and original text
     * @param i begin position for the copy
     * @param i2 end position for the copy
     * @return the modified copy
     */
    private static CoreLabel copyCoreLabel(CoreLabel coreLabel, String str, int i, int i2) {
        CoreLabel piece = new CoreLabel(coreLabel);
        piece.setWord(str);
        piece.setValue(str);
        piece.setBeginPosition(i);
        piece.setEndPosition(i2);
        piece.set(CoreAnnotations.OriginalTextAnnotation.class, str);
        return piece;
    }

    /**
     * Copies {@code coreLabel} with new text, deriving the end position from the
     * begin position plus the length of {@code str}.
     *
     * @param coreLabel label to copy
     * @param str text for the copy
     * @param i begin position for the copy
     * @return the modified copy
     */
    private static CoreLabel copyCoreLabel(CoreLabel coreLabel, String str, int i) {
        int end = i + str.length();
        return copyCoreLabel(coreLabel, str, i, end);
    }

    /**
     * Splits a contraction token into two labels.
     *
     * <p>The first piece is returned and the second is appended to the split
     * buffer. Supported words, compared case-insensitively, are "del", "al",
     * "conmigo", "consigo" and "contigo". The ParentAnnotation is removed from
     * {@code coreLabel} before it is copied, so neither piece carries it.
     *
     * @param coreLabel the contraction token
     * @return the label for the first piece
     * @throws IllegalArgumentException if the word is not one of the supported contractions
     */
    private CoreLabel processContraction(CoreLabel coreLabel) {
        coreLabel.remove(CoreAnnotations.ParentAnnotation.class);
        String word = coreLabel.word();
        // Locale.ROOT keeps the comparison stable under default locales with special
        // casing rules (e.g. Turkish, where 'I' does not lower-case to 'i').
        String lowered = word.toLowerCase(Locale.ROOT);

        String first;
        String second;
        int secondOffset;
        int secondLength;
        switch (lowered) {
            case "del":
            case "al": {
                int lastIndex = lowered.length() - 1;
                first = word.substring(0, lastIndex);
                // The case of the article follows the case of the final letter.
                second = Character.isLowerCase(word.charAt(lastIndex)) ? "el" : "EL";
                secondOffset = 1;
                secondLength = lastIndex;
                break;
            }
            case "conmigo":
            case "consigo":
                first = word.substring(0, 3);
                // "conmigo" -> "con" + "mí", "consigo" -> "con" + "sí"
                second = word.charAt(3) + "í";
                secondOffset = 3;
                secondLength = 4;
                break;
            case "contigo":
                first = word.substring(0, 3);
                second = word.substring(3, 5);
                secondOffset = 3;
                secondLength = 4;
                break;
            default:
                throw new IllegalArgumentException("Invalid contraction provided to processContraction");
        }

        int secondBegin = coreLabel.beginPosition() + secondOffset;
        this.compoundBuffer.add(copyCoreLabel(coreLabel, second, secondBegin, secondBegin + secondLength));
        return copyCoreLabel(coreLabel, first, coreLabel.beginPosition(), secondBegin);
    }

    /**
     * Splits attached pronouns off a verb token.
     *
     * <p>The verb stripper is asked to separate the pronouns. If it returns
     * {@code null} the token is returned as is (minus its ParentAnnotation).
     * Otherwise each pronoun is queued in the split buffer with consecutive
     * offsets starting right after the original stem, and a label for the stem
     * is returned.
     *
     * @param coreLabel the verb token
     * @return the stem label, or {@code coreLabel} itself when nothing was stripped
     */
    private CoreLabel processVerb(CoreLabel coreLabel) {
        coreLabel.remove(CoreAnnotations.ParentAnnotation.class);
        SpanishVerbStripper.StrippedVerb stripped = this.verbStripper.separatePronouns(coreLabel.word());
        if (stripped == null) {
            return coreLabel;
        }

        String originalStem = stripped.getOriginalStem();
        int stemEnd = coreLabel.beginPosition() + originalStem.length();

        // Pronouns follow the stem back to back.
        int pronounBegin = stemEnd;
        for (String pronoun : stripped.getPronouns()) {
            this.compoundBuffer.add(copyCoreLabel(coreLabel, pronoun, pronounBegin));
            pronounBegin += pronoun.length();
        }

        // The stem label carries the stripper's stem as its word, but keeps the
        // original stem as its original text.
        CoreLabel stemLabel = copyCoreLabel(coreLabel, stripped.getStem(), coreLabel.beginPosition(), stemEnd);
        stemLabel.setOriginalText(stripped.getOriginalStem());
        return stemLabel;
    }

    /**
     * Splits a hyphenated compound into its parts, keeping each hyphen as its own piece.
     *
     * <p>Every hyphen in the word is padded with spaces and the result is split on
     * whitespace. Each piece becomes a copy of {@code coreLabel} whose offsets
     * advance by the length of the preceding pieces. All pieces are appended to the
     * split buffer and the first buffered entry is removed and returned.
     *
     * @param coreLabel the compound token
     * @return the first piece
     */
    private CoreLabel processCompound(CoreLabel coreLabel) {
        coreLabel.remove(CoreAnnotations.ParentAnnotation.class);

        String padded = pDash.matcher(coreLabel.word()).replaceAll(" - ");
        String[] parts = pSpace.split(padded);

        int partBegin = coreLabel.beginPosition();
        for (String part : parts) {
            this.compoundBuffer.add(copyCoreLabel(coreLabel, part, partBegin));
            partBegin += part.length();
        }
        return this.compoundBuffer.remove(0);
    }

    /**
     * Creates a tokenizer factory for the given token type and option string.
     *
     * @param lexedTokenFactory factory used to build the tokens
     * @param str comma-separated option string
     * @return a new tokenizer factory
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> lexedTokenFactory, String str) {
        TokenizerFactory<T> tokenizerFactory = new SpanishTokenizerFactory<T>(lexedTokenFactory, str);
        return tokenizerFactory;
    }

    /**
     * Creates a tokenizer factory for the given token type, configured with
     * {@link #DEFAULT_OPTIONS}.
     *
     * @param lexedTokenFactory factory used to build the tokens
     * @return a new tokenizer factory
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> lexedTokenFactory) {
        TokenizerFactory<T> tokenizerFactory = new SpanishTokenizerFactory<T>(lexedTokenFactory, DEFAULT_OPTIONS);
        return tokenizerFactory;
    }

    /**
     * Creates a CoreLabel tokenizer factory and then applies {@link #ANCORA_OPTIONS}
     * on top of the options it was created with.
     *
     * @return the configured factory
     */
    public static TokenizerFactory<CoreLabel> ancoraFactory() {
        TokenizerFactory<CoreLabel> ancora = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
        ancora.setOptions(ANCORA_OPTIONS);
        return ancora;
    }

    /**
     * Creates a CoreLabel tokenizer factory via
     * {@link SpanishTokenizerFactory#newCoreLabelTokenizerFactory()}.
     *
     * @return a new CoreLabel tokenizer factory
     */
    public static TokenizerFactory<CoreLabel> coreLabelFactory() {
        TokenizerFactory<CoreLabel> coreLabels = SpanishTokenizerFactory.newCoreLabelTokenizerFactory();
        return coreLabels;
    }

    /**
     * Shorthand for {@link #coreLabelFactory()}.
     *
     * @return a new CoreLabel tokenizer factory
     */
    public static TokenizerFactory<CoreLabel> factory() {
        TokenizerFactory<CoreLabel> coreLabels = coreLabelFactory();
        return coreLabels;
    }

    /**
     * Builds the command-line help text: a usage header followed by one line per option.
     *
     * @return the help text, terminated by a line separator
     */
    private static String usage() {
        String[] lines = {
            "Options:",
            "   -help          : Print this message.",
            "   -ancora        : Tokenization style of AnCora (fixed).",
            "   -lowerCase     : Apply lowercasing.",
            "   -encoding type : Encoding format.",
            "   -options str   : Orthographic options (see SpanishLexer.java)",
            "   -tokens        : Output tokens as line-separated instead of space-separated.",
            "   -onePerLine    : Output tokens one per line.",
        };
        String eol = System.lineSeparator();
        StringBuilder text = new StringBuilder();
        text.append(String.format("Usage: java %s [OPTIONS] < file%n%n", SpanishTokenizer.class.getName()));
        for (String line : lines) {
            text.append(line).append(eol);
        }
        return text.toString();
    }

    /**
     * Describes the command-line flags for argument parsing: each flag name is
     * mapped to the number of values that follow it.
     *
     * @return a fresh, mutable map of flag name to argument count
     */
    private static Map<String, Integer> argOptionDefs() {
        String[] switches = {"help", "ftb", "ancora", "lowerCase", SequenceMatchRules.TOKEN_PATTERN_RULE_TYPE};
        String[] valued = {"encoding", "options"};

        Map<String, Integer> defs = Generics.newHashMap();
        for (String flag : switches) {
            defs.put(flag, 0);
        }
        for (String flag : valued) {
            defs.put(flag, 1);
        }
        return defs;
    }

    /**
     * Command-line entry point: tokenizes standard input and writes the tokens to
     * standard output.
     *
     * <p>With {@code -help} the usage text is logged and nothing else happens.
     * Otherwise the tokenizer options are assembled from {@code -ancora} and
     * {@code -options}, and {@code tokenizeNLs} is appended unless the tokens flag
     * is set. Tokens equal to {@code *NL*} are counted as lines; all other tokens
     * are written either one per line ({@code -onePerLine}) or separated by a
     * delimiter within a line. A throughput summary is printed to standard error
     * at the end.
     *
     * @param strArr command-line arguments
     * @throws RuntimeIOException if the encoding is unsupported or reading/writing fails
     */
    public static void main(String[] strArr) {
        Properties props = StringUtils.argsToProperties(strArr, argOptionDefs());
        if (props.containsKey("help")) {
            log.info(usage());
            return;
        }

        TokenizerFactory<CoreLabel> tokenizerFactory = coreLabelFactory();
        String options = props.containsKey("ancora") ? ANCORA_OPTIONS : "";
        if (props.containsKey("options")) {
            // Append the value of the "options" property, not the Properties object itself.
            String userOptions = props.getProperty("options");
            options = options.isEmpty() ? userOptions : options + ',' + userOptions;
        }
        // NOTE(review): this constant is presumably the "tokens" flag named in usage() — confirm.
        if (!PropertiesUtils.getBool(props, SequenceMatchRules.TOKEN_PATTERN_RULE_TYPE, false)) {
            options = options.isEmpty() ? "tokenizeNLs" : options + ",tokenizeNLs";
        }
        tokenizerFactory.setOptions(options);

        String encoding = props.getProperty("encoding", "UTF-8");
        boolean toLower = PropertiesUtils.getBool(props, "lowerCase", false);
        Locale spanish = new Locale("es");
        boolean onePerLine = PropertiesUtils.getBool(props, "onePerLine", false);

        int lineCount = 0;
        int tokenCount = 0;
        long startNanos = System.nanoTime();
        try {
            Tokenizer<CoreLabel> tokenizer =
                    tokenizerFactory.getTokenizer(new BufferedReader(new InputStreamReader(System.in, encoding)));
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, encoding));
            boolean needDelimiter = false;
            while (tokenizer.hasNext()) {
                tokenCount++;
                String word = tokenizer.next().word();
                if (word.equals("*NL*")) {
                    lineCount++;
                    if (!onePerLine) {
                        writer.newLine();
                        needDelimiter = false;
                    }
                    continue;
                }
                String text = toLower ? word.toLowerCase(spanish) : word;
                if (onePerLine) {
                    writer.write(text);
                    writer.newLine();
                } else {
                    if (needDelimiter) {
                        // NOTE(review): presumably a single space (usage() says "space-separated") — confirm.
                        writer.write(AddNode.ATOM_DELIMITER);
                    }
                    writer.write(text);
                    needDelimiter = true;
                }
            }
            // Push buffered output to System.out; without this the tail of the
            // output can be lost when the JVM exits.
            writer.flush();

            double elapsedSeconds = (System.nanoTime() - startNanos) / 1.0E9d;
            System.err.printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n",
                    lineCount, tokenCount, lineCount / elapsedSeconds);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeIOException("Bad character encoding", e);
        } catch (IOException e2) {
            throw new RuntimeIOException(e2);
        }
    }
}
