package ai.djl.modality.nlp.bert;

import ai.djl.modality.nlp.NlpUtils;
import ai.djl.modality.nlp.Vocabulary;
import ai.djl.modality.nlp.preprocess.LambdaProcessor;
import ai.djl.modality.nlp.preprocess.LowerCaseConvertor;
import ai.djl.modality.nlp.preprocess.PunctuationSeparator;
import ai.djl.modality.nlp.preprocess.SimpleTokenizer;
import ai.djl.modality.nlp.preprocess.TextCleaner;
import ai.djl.modality.nlp.preprocess.TextProcessor;
import ai.djl.modality.nlp.preprocess.UnicodeNormalizer;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

/* loaded from: input_file:ai/djl/modality/nlp/bert/BertFullTokenizer.class */
/**
 * A {@link BertTokenizer} that first runs a fixed chain of basic text preprocessors over the
 * input and then hands the result to a {@link WordpieceTokenizer} built on the supplied
 * {@link Vocabulary}.
 */
public class BertFullTokenizer extends BertTokenizer {
    private final Vocabulary vocabulary;
    private final List<TextProcessor> basicBertPreprocessors;
    private final WordpieceTokenizer wordpieceTokenizer;

    /**
     * Creates a tokenizer backed by the given vocabulary.
     *
     * @param vocabulary the vocabulary passed to the wordpiece tokenizer and exposed through
     *     {@link #getVocabulary()}
     * @param lowerCase whether a {@link LowerCaseConvertor} is included in the preprocessing chain
     */
    public BertFullTokenizer(Vocabulary vocabulary, boolean lowerCase) {
        this.vocabulary = vocabulary;
        this.basicBertPreprocessors = getPreprocessors(lowerCase);
        // NOTE(review): "[UNK]" and 200 are presumably the unknown-token marker and the maximum
        // characters per input word -- confirm against WordpieceTokenizer's constructor.
        this.wordpieceTokenizer = new WordpieceTokenizer(vocabulary, "[UNK]", 200);
    }

    /**
     * Returns the vocabulary this tokenizer was constructed with.
     *
     * @return the vocabulary this tokenizer was constructed with
     */
    public Vocabulary getVocabulary() {
        return vocabulary;
    }

    /**
     * Tokenizes a sentence by applying every basic preprocessor in order and then the wordpiece
     * tokenizer.
     *
     * @param input the text to tokenize
     * @return the tokens produced by the wordpiece tokenizer
     */
    @Override
    public List<String> tokenize(String input) {
        // Start from a mutable single-element list; each processor maps a token list to a new one.
        List<String> tokens = new ArrayList<>(Collections.singletonList(input));
        for (TextProcessor processor : basicBertPreprocessors) {
            tokens = processor.preprocess(tokens);
        }
        return wordpieceTokenizer.preprocess(tokens);
    }

    /**
     * Rebuilds a sentence from tokens: the tokens are joined with single spaces, every
     * occurrence of {@code " ##"} is removed so continuation pieces attach to the preceding
     * token, and the result is trimmed.
     *
     * @param tokens the tokens to join
     * @return the reconstructed sentence
     */
    @Override
    public String buildSentence(List<String> tokens) {
        return String.join(" ", tokens).replace(" ##", "").trim();
    }

    /**
     * Builds the ordered chain of basic preprocessors applied before wordpiece tokenization.
     *
     * <p>The order is significant because {@link #tokenize(String)} feeds each processor the
     * output of the previous one.
     *
     * @param lowerCase whether to include a {@link LowerCaseConvertor} in the chain
     * @return a new mutable list of preprocessors, in application order
     */
    public static List<TextProcessor> getPreprocessors(boolean lowerCase) {
        List<TextProcessor> processors = new ArrayList<>(10);
        // Match NUL, the Unicode replacement character (U+FFFD) and control characters.
        processors.add(
                new TextCleaner(c -> c == 0 || c == 0xFFFD || NlpUtils.isControl(c), '\0'));
        // Match whitespace characters, with a plain space as the replacement argument.
        processors.add(new TextCleaner(NlpUtils::isWhiteSpace, ' '));
        processors.add(new LambdaProcessor(String::trim));
        processors.add(new SimpleTokenizer());
        if (lowerCase) {
            processors.add(new LowerCaseConvertor());
        }
        // Decompose characters (NFD) so combining marks become separate chars, then match
        // those non-spacing marks with the cleaner that follows.
        processors.add(new UnicodeNormalizer(Normalizer.Form.NFD));
        processors.add(
                new TextCleaner(c -> Character.getType(c) == Character.NON_SPACING_MARK, '\0'));
        processors.add(new PunctuationSeparator());
        processors.add(new LambdaProcessor(String::trim));
        return processors;
    }
}
