package org.languagetool.tokenizers.fr;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.fr.FrenchTagger;
import org.languagetool.tokenizers.WordTokenizer;

/* loaded from: input_file:org/languagetool/tokenizers/fr/FrenchWordTokenizer.class */
/**
 * Word tokenizer for French text.
 *
 * <p>Tokenization runs in three stages:
 * <ol>
 *   <li>Characters that would otherwise be treated as separators but belong inside a token
 *       (apostrophes and hyphens between letters, decimal points/commas and grouping spaces
 *       between digits) are replaced by placeholders.</li>
 *   <li>The text is split with a {@link StringTokenizer} using the parent tokenizer's
 *       separator characters plus the hyphen, and the placeholders are turned back into the
 *       characters they stand for.</li>
 *   <li>Each resulting chunk is matched against a list of patterns that separate elided
 *       prefixes (for example {@code l'}, {@code qu'}) and hyphenated suffixes (for example
 *       {@code -t-il}, {@code -vous}) from the rest of the word.</li>
 * </ol>
 */
public class FrenchWordTokenizer extends WordTokenizer {

    /** Flags used by every pattern in this class (numeric value 66). */
    private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    // --- Stage 1 patterns: sequences whose middle character must survive the raw split. ---

    private static final Pattern TYPEWRITER_APOSTROPHE = Pattern.compile("([\\p{L}])'([\\p{L}1\"‘“«])", FLAGS);
    private static final Pattern TYPOGRAPHIC_APOSTROPHE = Pattern.compile("([\\p{L}])’([\\p{L}1\"‘“«])", FLAGS);
    /** Two hyphens separated by a single letter; handled first because plain HYPHENS matches cannot overlap. */
    private static final Pattern NEARBY_HYPHENS = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", FLAGS);
    private static final Pattern HYPHENS = Pattern.compile("([\\p{L}])-([\\p{L}\\d])", FLAGS);
    private static final Pattern DECIMAL_POINT = Pattern.compile("([\\d])\\.([\\d])", FLAGS);
    private static final Pattern DECIMAL_COMMA = Pattern.compile("([\\d]),([\\d])", FLAGS);
    /**
     * A space following four consecutive digits. It is hidden behind a temporary placeholder so
     * that SPACE_DIGITS cannot treat it as a digit-grouping space, and is restored to a plain
     * space before the text is split.
     */
    private static final Pattern SPACE_DIGITS0 = Pattern.compile("([\\d]{4}) ", FLAGS);
    private static final Pattern SPACE_DIGITS = Pattern.compile("([\\d]) ([\\d][\\d][\\d])\\b", FLAGS);
    private static final Pattern SPACE_DIGITS2 = Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])\\b", FLAGS);

    // --- Placeholders: a name wrapped in pairs of U+0001 control characters. ---

    private static final String MARK = "\u0001\u0001";
    private static final String PH_APOS_TYPEW = MARK + "FR_APOS_TYPEW" + MARK;
    private static final String PH_APOS_TYPOG = MARK + "FR_APOS_TYPOG" + MARK;
    private static final String PH_HYPHEN = MARK + "FR_HYPHEN" + MARK;
    private static final String PH_DECIMAL_POINT = MARK + "FR_DECIMALPOINT" + MARK;
    private static final String PH_DECIMAL_COMMA = MARK + "FR_DECIMALCOMMA" + MARK;
    private static final String PH_SPACE = MARK + "FR_SPACE" + MARK;
    private static final String PH_SPACE0 = MARK + "FR_SPACE0" + MARK;

    /** Lower-case hyphenated words that are always kept as a single token. */
    private static final List<String> DO_NOT_SPLIT = Arrays.asList("mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres", "anti-ivg", "anti-uv", "anti-vih", "al-qaïda", "c'est-à-dire", "add-on", "add-ons", "rendez-vous", "garde-à-vous", "chez-eux", "chez-moi", "chez-nous", "chez-soi", "chez-toi", "chez-vous", "m'as-tu-vu");

    // --- Stage 3 patterns: building blocks shared by several split patterns. ---

    /** Capturing group matching an elided word followed by either apostrophe form. */
    private static final String ELISION_PREFIX = "([cç]['’]|j['’]|n['’]|m['’]|t['’]|s['’]|l['’]|d['’]|qu['’]|jusqu['’]|lorsqu['’]|puisqu['’]|quoiqu['’])";
    /** Capturing group matching one hyphen-attached suffix. */
    private static final String HYPHEN_SUFFIX = "(-ce|-elle|-t-elle|-elles|-t-elles|-en|-il|-t-il|-ils|-t-ils|-je|-la|-le|-les|-leur|-lui|-moi|-nous|-on|-t-on|-toi|-tu|-vous|-vs|-y)";

    /**
     * Split patterns, tried in this order; the first one that matches decides how a chunk is
     * divided (one output piece per capturing group). Compiled once for all instances.
     */
    private static final Pattern[] SPLIT_PATTERNS = {
        // Fixed expressions kept whole (single capturing group).
        Pattern.compile("^(c['’]te?|m['’]as-tu-vu|c['’]est-à-dire|add-on|add-ons|rendez-vous|garde-à-vous|chez-eux|chez-moi|chez-nous|chez-soi|chez-toi|chez-vous)$", FLAGS),
        // Elided prefix + hyphen-free body + one suffix.
        Pattern.compile("^" + ELISION_PREFIX + "([^\\-]*)" + HYPHEN_SUFFIX + "$", FLAGS),
        // Elided prefix + anything not starting with an apostrophe or hyphen.
        Pattern.compile("^" + ELISION_PREFIX + "([^'’\\-].*)$", FLAGS),
        // Hyphen-free body + two suffixes. The first suffix group lists its alternatives in a
        // slightly different order than HYPHEN_SUFFIX; it is kept verbatim.
        Pattern.compile("^([^\\-]*)(-ce|-t-elle|-t-elles|-elle|-elles|-en|-il|-t-il|-ils|-t-ils|-je|-la|-le|-les|-leur|-lui|-moi|-nous|-on|-t-on|-toi|-tu|-vous|-vs|-y)" + HYPHEN_SUFFIX + "$", FLAGS),
        // Hyphen-free body + "-t" or "-m" + elided "en" / "y".
        Pattern.compile("^([^\\-]*)(-t|-m)(['’]en|['’]y)$", FLAGS),
        // Any body + a "-t-" suffix.
        Pattern.compile("^(.*)(-t-elle|-t-elles|-t-il|-t-ils|-t-on)$", FLAGS),
        // Any body + one suffix.
        Pattern.compile("^(.*)" + HYPHEN_SUFFIX + "$", FLAGS),
    };

    /** Separator characters of the parent tokenizer, extended with the hyphen. */
    private final String frTokenizingChars = super.getTokenizingCharacters() + "-";

    /** Creates a tokenizer; all patterns are shared static state, so construction is cheap. */
    public FrenchWordTokenizer() {
    }

    /**
     * Splits French text into tokens.
     *
     * <p>Separator characters are returned as tokens of their own. Hyphens at the very start or
     * end of a chunk are emitted as separate {@code "-"} tokens. The collected tokens are passed
     * through {@code joinEMailsAndUrls} before being returned.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return the tokens in text order, as returned by {@code joinEMailsAndUrls}
     */
    public List<String> tokenize(String str) {
        final List<String> tokens = new ArrayList<>();
        final StringTokenizer chunks = new StringTokenizer(protectInnerCharacters(str), this.frTokenizingChars, true);
        while (chunks.hasMoreElements()) {
            String word = restoreInnerCharacters(chunks.nextToken());

            // Leading hyphens become separate tokens right away.
            while (word.length() > 1 && word.startsWith("-")) {
                tokens.add("-");
                word = word.substring(1);
            }
            // Trailing hyphens are counted now and emitted after the word itself.
            int trailingHyphens = 0;
            while (word.length() > 1 && word.endsWith("-")) {
                word = word.substring(0, word.length() - 1);
                trailingHyphens++;
            }

            final Matcher split = firstMatchingSplit(word);
            if (split == null) {
                tokens.addAll(wordsToAdd(word));
            } else {
                for (int group = 1; group <= split.groupCount(); group++) {
                    tokens.addAll(wordsToAdd(split.group(group)));
                }
            }

            for (; trailingHyphens > 0; trailingHyphens--) {
                tokens.add("-");
            }
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Replaces in-word apostrophes and hyphens, decimal separators and digit-grouping spaces by
     * placeholders so that the raw split leaves them in place. The order of the replacements is
     * significant and must not be changed.
     */
    private static String protectInnerCharacters(String text) {
        String result = TYPEWRITER_APOSTROPHE.matcher(text).replaceAll("$1" + PH_APOS_TYPEW + "$2");
        result = TYPOGRAPHIC_APOSTROPHE.matcher(result).replaceAll("$1" + PH_APOS_TYPOG + "$2");
        result = NEARBY_HYPHENS.matcher(result).replaceAll("$1" + PH_HYPHEN + "$2" + PH_HYPHEN + "$3");
        result = HYPHENS.matcher(result).replaceAll("$1" + PH_HYPHEN + "$2");
        result = DECIMAL_POINT.matcher(result).replaceAll("$1" + PH_DECIMAL_POINT + "$2");
        result = DECIMAL_COMMA.matcher(result).replaceAll("$1" + PH_DECIMAL_COMMA + "$2");
        result = SPACE_DIGITS2.matcher(result).replaceAll("$1" + PH_SPACE + "$2" + PH_SPACE + "$3");
        result = SPACE_DIGITS0.matcher(result).replaceAll("$1" + PH_SPACE0);
        result = SPACE_DIGITS.matcher(result).replaceAll("$1" + PH_SPACE + "$2");
        // The shielded spaces have done their job; make them ordinary separators again.
        return result.replace(PH_SPACE0, " ");
    }

    /** Turns the placeholders inside one raw chunk back into the characters they stand for. */
    private static String restoreInnerCharacters(String chunk) {
        return chunk
                .replace(PH_APOS_TYPEW, "'")
                .replace(PH_APOS_TYPOG, "’")
                .replace(PH_HYPHEN, "-")
                .replace(PH_DECIMAL_POINT, ".")
                .replace(PH_DECIMAL_COMMA, ",")
                .replace(PH_SPACE, " ");
    }

    /**
     * Returns a matcher positioned on the first split pattern that matches {@code word}, or
     * {@code null} when none of the patterns matches.
     */
    private static Matcher firstMatchingSplit(String word) {
        for (Pattern pattern : SPLIT_PATTERNS) {
            final Matcher candidate = pattern.matcher(word);
            if (candidate.find()) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Decides whether a piece is kept whole or broken at its hyphens.
     *
     * <p>An empty piece yields nothing. A piece is kept whole when it has no hyphen, when the
     * French tagger returns a tagged reading for it, or when it is listed in the do-not-split
     * list; otherwise it is split at every hyphen and the hyphens are returned as tokens too.
     *
     * @param str the piece to examine; must not be {@code null}
     * @return the resulting tokens, possibly empty
     */
    private List<String> wordsToAdd(String str) {
        final List<String> pieces = new ArrayList<>();
        // NOTE(review): the lock is per tokenizer instance; presumably it guards the tagger
        // lookup below - confirm whether it is still required.
        synchronized (this) {
            if (str.isEmpty()) {
                return pieces;
            }
            if (!str.contains("-") || isKnownToTagger(str) || isProtectedCompound(str)) {
                pieces.add(str);
                return pieces;
            }
            final StringTokenizer parts = new StringTokenizer(str, "-", true);
            while (parts.hasMoreElements()) {
                pieces.add(parts.nextToken());
            }
        }
        return pieces;
    }

    /**
     * Looks the word up with the French tagger after removing soft hyphens (U+00AD) and
     * normalising the typographic apostrophe to the typewriter one.
     */
    private static boolean isKnownToTagger(String word) {
        final String normalized = word.replace("\u00ad", "").replace("’", "'");
        return FrenchTagger.INSTANCE.tag(Arrays.asList(normalized)).get(0).isTagged();
    }

    /**
     * Checks the do-not-split list. Lower-casing uses {@link Locale#ROOT} so the result does not
     * depend on the JVM's default locale (under a Turkish default locale {@code "I"} would become
     * a dotless i and entries such as {@code "anti-ivg"} would be missed).
     */
    private static boolean isProtectedCompound(String word) {
        return DO_NOT_SPLIT.contains(word.toLowerCase(Locale.ROOT));
    }
}
