package edu.stanford.nlp.international.arabic.pipeline;

import edu.stanford.nlp.ie.pascal.ISODateInstance;
import edu.stanford.nlp.international.arabic.Buckwalter;
import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;
import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/international/arabic/pipeline/DefaultLexicalMapper.class */
/**
 * Normalizes individual lexical items (tokens) of Arabic text, given either as UTF-8 Arabic
 * script or in a Latin transliteration (the {@code bw*} patterns below).
 *
 * <p>{@link #map(String, String)} picks the normalization path by looking for a character in the
 * range U+0600..U+06FF: if one is present the UTF-8 rules are applied, otherwise the
 * transliteration rules are applied. Optional behaviors are switched on through
 * {@link #setup(File, String...)}.
 *
 * <p>The option flags are plain mutable fields with no synchronization.
 */
public class DefaultLexicalMapper implements Mapper, Serializable {
    private static final long serialVersionUID = -3798804368296999785L;

    /** Any character in U+0600..U+06FF; used to decide whether a token is UTF-8 Arabic script. */
    private final Pattern utf8ArabicChart = Pattern.compile("[\u0600-ۿ]");

    // Transliteration-side patterns. The alef replacement and the tatweel pattern are spelled out
    // as literals rather than borrowed from unrelated constants in other packages.
    private final String bwAlefChar = "A";
    private final Pattern bwDiacritics = Pattern.compile("F|N|K|a|u|i|\\~|o");
    private final Pattern bwTatweel = Pattern.compile("_");
    private final Pattern bwAlef = Pattern.compile("\\{|\\||>|<");
    private final Pattern bwQuran = Pattern.compile("`");
    private final Pattern bwNullAnaphoraMarker = Pattern.compile("\\[nll\\]");

    // Punctuation / digit classes (public: part of this class's visible surface).
    public final Pattern latinPunc = Pattern.compile("([!-/:-@\\u005B-`{-~¡-¿÷‐-‧‰-⁞₠-₺])+");
    public final Pattern arabicPunc = Pattern.compile("([«»؉-؍؛-؟٪٬-٭۔])+");
    public final Pattern arabicDigit = Pattern.compile("([۰-۹٠-٩])+");

    // UTF-8-side patterns.
    private final Pattern utf8Diacritics = Pattern.compile("َ|ً|ُ|ٌ|ِ|ٍ|ّ|ْ|ٰ");
    private final Pattern utf8Tatweel = Pattern.compile("ـ");
    private final Pattern utf8Alef = Pattern.compile("ا|إ|أ|آ|ٱ");
    private final Pattern utf8Quran = Pattern.compile("[ؕ-ؚۖ-ۥ]");
    private final Pattern utf8ProDrop = Pattern.compile("\\[نلل\\]");

    /** Runs of '-' at the very start or very end of a token. */
    public final Pattern segmentationMarker = Pattern.compile("^-+|-+$");
    private final Pattern morphemeBoundary = Pattern.compile("\\+");
    private final Pattern hasDigit = Pattern.compile("\\d+");

    // Option flags; all default to false and are only ever switched on by setup().
    private boolean useATBVocalizedSectionMapping = false;
    private boolean stripMorphemeMarkersInUTF8 = false;
    private boolean stripSegmentationMarkersInUTF8 = false;

    /** Whitespace-separated parent tags whose children are returned by map() untouched (after trim). */
    private final String parentTagString = "PUNC LATIN -NONE-";
    /** Whitespace-separated clitic forms in UTF-8; transliterated once to build {@link #bwClitics}. */
    private final String utf8CliticString = "ل ف و ما ه ها هم هن نا كم تن تم ى ي هما ك ب م";

    private final Set<String> parentTagsToEscape =
            Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(parentTagString.split("\\s+"))));
    private final Set<String> bwClitics =
            Collections.unmodifiableSet(Generics.newHashSet(
                    Arrays.asList(new Buckwalter(true).apply(utf8CliticString).split("\\s+"))));

    /**
     * Normalizes a token that contains UTF-8 Arabic script.
     *
     * @param element the trimmed token
     * @return the token unchanged if it consists solely of Latin or solely of Arabic punctuation;
     *     otherwise the token with diacritics, tatweel, alef variants, Quranic marks and the
     *     pro-drop marker normalized, plus optional marker stripping
     */
    private String mapUtf8(String element) {
        // Pure punctuation tokens are passed through untouched.
        if (latinPunc.matcher(element).matches() || arabicPunc.matcher(element).matches()) {
            return element;
        }

        String result = utf8Diacritics.matcher(element).replaceAll("");

        // Tatweel is only removed from multi-character strings, so a lone tatweel survives.
        if (result.length() > 1) {
            result = utf8Tatweel.matcher(result).replaceAll("");
        }

        // Collapse all alef variants to bare alef, then drop Quranic marks and the pro-drop marker.
        result = utf8Alef.matcher(result).replaceAll("ا");
        result = utf8Quran.matcher(result).replaceAll("");
        result = utf8ProDrop.matcher(result).replaceAll("");

        if (stripMorphemeMarkersInUTF8) {
            String stripped = morphemeBoundary.matcher(result).replaceAll("");
            // Never let stripping reduce the token to the empty string.
            if (!stripped.isEmpty()) {
                result = stripped;
            }
        }

        if (stripSegmentationMarkersInUTF8) {
            String stripped = segmentationMarker.matcher(result).replaceAll("");
            if (!stripped.isEmpty()) {
                result = stripped;
            }
        }

        return result;
    }

    /**
     * Normalizes a token that contains no character in U+0600..U+06FF, applying the
     * transliteration rules.
     *
     * @param element the trimmed token
     * @return the token unchanged if it consists solely of Latin punctuation; otherwise the
     *     normalized token
     */
    private String mapBuckwalter(String element) {
        if (latinPunc.matcher(element).matches()) {
            return element;
        }

        String result = bwDiacritics.matcher(element).replaceAll("");

        // Tatweel is only removed from multi-character strings, so a lone "_" survives.
        if (result.length() > 1) {
            result = bwTatweel.matcher(result).replaceAll("");
        }

        result = bwAlef.matcher(result).replaceAll(bwAlefChar);
        result = bwQuran.matcher(result).replaceAll("");
        result = bwNullAnaphoraMarker.matcher(result).replaceAll("");

        if (useATBVocalizedSectionMapping && result.length() > 1) {
            result = morphemeBoundary.matcher(result).replaceAll("");

            // Leading/trailing dashes are stripped only from digit-free tokens, and only when the
            // stripped form is non-empty and is not one of the known clitics.
            Matcher segMatcher = segmentationMarker.matcher(result);
            if (segMatcher.find() && !hasDigit.matcher(result).find()) {
                String stripped = segMatcher.replaceAll("");
                if (!stripped.isEmpty() && !bwClitics.contains(stripped)) {
                    result = stripped;
                }
            }
        } else if (result.length() > 1 && !ATBTreeUtils.reservedWords.contains(result)) {
            result = segmentationMarker.matcher(result).replaceAll("");
        }

        return result;
    }

    /**
     * Normalizes a single token.
     *
     * @param parent the parent tag of the token; may be {@code null}. If it is one of
     *     {@code PUNC}, {@code LATIN} or {@code -NONE-} the token is returned trimmed but otherwise
     *     unmodified.
     * @param element the raw token; must not be {@code null}
     * @return the normalized token
     * @throws NullPointerException if {@code element} is {@code null}
     */
    @Override
    public String map(String parent, String element) {
        String trimmed = element.trim();

        if (parent != null && parentTagsToEscape.contains(parent)) {
            return trimmed;
        }

        return utf8ArabicChart.matcher(trimmed).find() ? mapUtf8(trimmed) : mapBuckwalter(trimmed);
    }

    /**
     * Enables optional behaviors. Recognized option strings are {@code "ATBVocalizedSection"},
     * {@code "StripSegMarkersInUTF8"} and {@code "StripMorphMarkersInUTF8"}; any other string is
     * ignored. Options can only be switched on, never off.
     *
     * @param path not used by this implementation
     * @param options option strings; {@code null} (array or element) is tolerated and ignored
     */
    @Override
    public void setup(File path, String... options) {
        if (options == null) {
            return;
        }

        for (String option : options) {
            if (option == null) {
                continue;
            }
            switch (option) {
                case "ATBVocalizedSection":
                    useATBVocalizedSectionMapping = true;
                    break;
                case "StripSegMarkersInUTF8":
                    stripSegmentationMarkersInUTF8 = true;
                    break;
                case "StripMorphMarkersInUTF8":
                    stripMorphemeMarkersInUTF8 = true;
                    break;
                default:
                    // Unknown options are silently ignored.
                    break;
            }
        }
    }

    /**
     * Reports whether the encoding of a token may be changed.
     *
     * @param parent the parent tag of the token; must not be {@code null}
     * @param element the token; must not be {@code null}
     * @return {@code true} if the parent tag contains {@code NUMERIC_COMMA}, or contains
     *     {@link ATBTreeUtils#puncTag} while the token is exactly {@code "r"}; otherwise
     *     {@code false} if the token contains a digit or the parent tag is one of the escaped
     *     tags, and {@code true} in all remaining cases
     */
    @Override
    public boolean canChangeEncoding(String parent, String element) {
        String tag = parent.trim();
        String token = element.trim();

        if (tag.contains("NUMERIC_COMMA")) {
            return true;
        }
        if (tag.contains(ATBTreeUtils.puncTag) && token.equals("r")) {
            return true;
        }

        boolean containsDigit = hasDigit.matcher(token).find();
        return !containsDigit && !parentTagsToEscape.contains(tag);
    }

    /** Small smoke test: maps one hard-coded transliterated string and prints the result. */
    public static void main(String[] strArr) {
        DefaultLexicalMapper mapper = new DefaultLexicalMapper();
        System.out.printf("< :-> %s\n", mapper.map(null, "FNKqq"));
    }
}
