package com.aliasi.chunk;

import com.aliasi.corpus.ObjectHandler;
import com.aliasi.hmm.AbstractHmmEstimator;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.symbol.SymbolTable;
import com.aliasi.tag.Tagging;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.Strings;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

/* loaded from: input_file:lib/lingpipe-4.1.0.jar:com/aliasi/chunk/CharLmHmmChunker.class */
public class CharLmHmmChunker extends HmmChunker implements ObjectHandler<Chunking>, Compilable {
    // Always false; not read by any method visible in this file.
    private final boolean mValidateTokenizer = false;
    // Estimator that receives all training events; exposed through getHmmEstimator().
    private final AbstractHmmEstimator mHmmEstimator;
    // Tokenizer factory used for training data; exposed through getTokenizerFactory().
    private final TokenizerFactory mTokenizerFactory;
    // Base tags already seen by smoothBaseTag(), which adds each tag the first time it
    // is encountered; toString() iterates it to enumerate the states to print.
    private final Set<String> mTagSet;
    // When true, smoothTags() smooths the tags of every handled training example.
    private final boolean mSmoothTags;
    // Zero-length typed array handed to toArray() in handle(Chunking).
    static final Chunk[] EMPTY_CHUNK_ARRAY = new Chunk[0];

    /* loaded from: input_file:lib/lingpipe-4.1.0.jar:com/aliasi/chunk/CharLmHmmChunker$Externalizer.class */
    /**
     * Externalizable helper written by {@code compileTo}. It writes the chunker's
     * tokenizer factory followed by its HMM estimator, and on read rebuilds a plain
     * {@link HmmChunker} (not a {@code CharLmHmmChunker}) from those two objects.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 4630707998932521821L;

        /** Chunker to write; {@code null} when built through the no-argument constructor. */
        final CharLmHmmChunker mChunker;

        /**
         * Creates an externalizer with no chunker attached.
         * NOTE(review): presumably required by the Externalizable read path -- confirm.
         */
        public Externalizer() {
            this(null);
        }

        /**
         * Creates an externalizer for the given chunker.
         *
         * @param charLmHmmChunker chunker whose tokenizer factory and estimator are written
         */
        public Externalizer(CharLmHmmChunker charLmHmmChunker) {
            this.mChunker = charLmHmmChunker;
        }

        /**
         * Reads a tokenizer factory and then a hidden Markov model, in that order,
         * and wraps them in a new {@link HmmChunker}.
         *
         * @param objectInput stream to read the two objects from
         * @return a new {@link HmmChunker} over the objects read
         * @throws ClassNotFoundException if a class of a read object cannot be found
         * @throws IOException if reading from the stream fails
         */
        @Override
        public Object read(ObjectInput objectInput) throws ClassNotFoundException, IOException {
            // The read order mirrors the write order used by writeExternal.
            TokenizerFactory tokenizerFactory = (TokenizerFactory) objectInput.readObject();
            HiddenMarkovModel model = (HiddenMarkovModel) objectInput.readObject();
            HmmDecoder decoder = new HmmDecoder(model);
            return new HmmChunker(tokenizerFactory, decoder);
        }

        /**
         * Writes the chunker's tokenizer factory and then its HMM estimator using
         * {@code AbstractExternalizable.compileOrSerialize}.
         *
         * @param objectOutput stream to write to
         * @throws IOException if writing to the stream fails
         */
        @Override
        public void writeExternal(ObjectOutput objectOutput) throws IOException {
            final CharLmHmmChunker chunker = this.mChunker;
            AbstractExternalizable.compileOrSerialize(chunker.getTokenizerFactory(), objectOutput);
            AbstractExternalizable.compileOrSerialize(chunker.getHmmEstimator(), objectOutput);
        }
    }

    /**
     * Creates a chunker over the given tokenizer factory and estimator with
     * tag smoothing disabled.
     *
     * @param tokenizerFactory tokenizer factory used for training and chunking
     * @param abstractHmmEstimator estimator that receives the training events
     */
    public CharLmHmmChunker(TokenizerFactory tokenizerFactory, AbstractHmmEstimator abstractHmmEstimator) {
        this(tokenizerFactory, abstractHmmEstimator, false);
    }

    /**
     * Creates a chunker over the given tokenizer factory and estimator, then
     * trains the boundary states through {@code smoothBoundaries()}.
     *
     * @param tokenizerFactory tokenizer factory used for training and chunking
     * @param abstractHmmEstimator estimator that receives the training events
     * @param z whether the tags of every handled training example are smoothed
     */
    public CharLmHmmChunker(TokenizerFactory tokenizerFactory, AbstractHmmEstimator abstractHmmEstimator, boolean z) {
        super(tokenizerFactory, new HmmDecoder(abstractHmmEstimator));
        // mValidateTokenizer is a final field with its own initializer; assigning it
        // again here is rejected by the compiler, so no assignment is made.
        this.mTagSet = new HashSet<>();
        this.mHmmEstimator = abstractHmmEstimator;
        this.mTokenizerFactory = tokenizerFactory;
        this.mSmoothTags = z;
        // Must run after mHmmEstimator is set: smoothBoundaries() reads it via getHmmEstimator().
        smoothBoundaries();
    }

    /**
     * Returns the estimator supplied at construction.
     *
     * @return the HMM estimator that receives this chunker's training events
     */
    public AbstractHmmEstimator getHmmEstimator() {
        return mHmmEstimator;
    }

    /**
     * Returns the tokenizer factory supplied at construction.
     *
     * @return the tokenizer factory used by this chunker
     */
    @Override
    public TokenizerFactory getTokenizerFactory() {
        return mTokenizerFactory;
    }

    /**
     * Trains the estimator on a dictionary entry of the given chunk type.
     *
     * <p>The entry is tokenized with {@link #getTokenizerFactory()} and the type is
     * passed to {@code smoothBaseTag}. A single-token entry trains one emission from
     * the {@code W_} state of the type. A longer entry trains an emission from the
     * begin state for the first token, from {@code M_} for every interior token and
     * from {@code E_} for the last token, together with the transition between each
     * pair of consecutive states.
     *
     * @param charSequence text of the dictionary entry
     * @param str chunk type of the entry
     * @throws IllegalArgumentException if tokenizing the entry yields no tokens
     */
    public void trainDictionary(CharSequence charSequence, String str) {
        char[] chars = Strings.toCharArray(charSequence);
        String[] tokens = getTokenizerFactory().tokenizer(chars, 0, chars.length).tokenize();
        if (tokens.length == 0) {
            throw new IllegalArgumentException("Did not find any tokens in entry. Char sequence=" + charSequence);
        }
        AbstractHmmEstimator estimator = getHmmEstimator();
        smoothBaseTag(str, estimator.stateSymbolTable(), estimator);

        if (tokens.length == 1) {
            estimator.trainEmit("W_" + str, tokens[0]);
            return;
        }

        String beginTag = BioTagChunkCodec.BEGIN_TAG_PREFIX + str;
        estimator.trainEmit(beginTag, tokens[0]);

        // Interior tokens all share the same state name, so build it once.
        String midTag = "M_" + str;
        int lastIndex = tokens.length - 1;
        String previousTag = beginTag;
        for (int i = 1; i < lastIndex; i++) {
            estimator.trainEmit(midTag, tokens[i]);
            estimator.trainTransit(previousTag, midTag);
            previousTag = midTag;
        }

        String endTag = "E_" + str;
        estimator.trainEmit(endTag, tokens[lastIndex]);
        estimator.trainTransit(previousTag, endTag);
    }

    /**
     * Trains on one chunking by converting it to parallel token, whitespace and
     * tag arrays and delegating to {@code handle(String[], String[], String[])}.
     *
     * <p>Chunks are visited in {@code Chunk.TEXT_ORDER_COMPARATOR} order. The text
     * between the end of the previous chunk and the start of the next one is tagged
     * by {@code outTag}, the chunk itself by {@code chunkTag}, and the text after the
     * last chunk by a final {@code outTag} call.
     *
     * <p>NOTE(review): a chunk starting before the previous chunk's end would give
     * {@code outTag} a negative-length span -- presumably callers supply
     * non-overlapping chunks; confirm.
     *
     * @param chunking chunking to train on
     */
    @Override
    public void handle(Chunking chunking) {
        CharSequence charSequence = chunking.charSequence();
        char[] charArray = Strings.toCharArray(charSequence);
        Chunk[] chunkArr = (Chunk[]) chunking.chunkSet().toArray(EMPTY_CHUNK_ARRAY);
        Arrays.sort(chunkArr, Chunk.TEXT_ORDER_COMPARATOR);

        List<String> tokens = new ArrayList<>();
        List<String> whitespaces = new ArrayList<>();
        List<String> tags = new ArrayList<>();

        int position = 0;
        for (Chunk chunk : chunkArr) {
            String type = chunk.type();
            int start = chunk.start();
            int end = chunk.end();
            outTag(charArray, position, start, tokens, whitespaces, tags, this.mTokenizerFactory);
            chunkTag(charArray, start, end, type, tokens, whitespaces, tags, this.mTokenizerFactory);
            position = end;
        }
        outTag(charArray, position, charSequence.length(), tokens, whitespaces, tags, this.mTokenizerFactory);

        handle(tokens.toArray(Strings.EMPTY_STRING_ARRAY),
                whitespaces.toArray(Strings.EMPTY_STRING_ARRAY),
                tags.toArray(Strings.EMPTY_STRING_ARRAY));
    }

    /**
     * Trains the estimator on one tagged token sequence, then smooths its tags.
     *
     * <p>The tags are passed through {@code trainNormalize} before being handed to
     * the estimator; the raw tags are what {@code smoothTags} receives afterwards.
     *
     * @param strArr tokens of the training example
     * @param strArr2 whitespaces of the training example (not used by this method)
     * @param strArr3 tags of the training example, parallel to {@code strArr}
     */
    void handle(String[] strArr, String[] strArr2, String[] strArr3) {
        AbstractHmmEstimator estimator = getHmmEstimator();
        List<String> tokens = Arrays.asList(strArr);
        List<String> normalizedTags = Arrays.asList(trainNormalize(strArr3));
        Tagging<String> tagging = new Tagging<String>(tokens, normalizedTags);
        estimator.handle(tagging);
        smoothTags(strArr3);
    }

    /**
     * Writes this chunker to the given output by wrapping it in an
     * {@code Externalizer} and writing that object.
     *
     * @param objectOutput output to write to
     * @throws IOException if writing the externalizer fails
     */
    @Override
    public void compileTo(ObjectOutput objectOutput) throws IOException {
        Externalizer externalizer = new Externalizer(this);
        objectOutput.writeObject(externalizer);
    }

    /**
     * Returns a dump of the estimator's start, end and transition log (base 2)
     * probabilities over the boundary states and the states derived from every
     * base tag in the tag set.
     *
     * <p>States are listed in sorted order. Each state contributes a blank line,
     * a {@code start(...)} line, an {@code end(...)} line and one {@code trans(...)}
     * line per target state.
     *
     * @return the multi-line probability dump
     */
    @Override
    public String toString() {
        // A sorted set keeps the report order independent of the tag set's iteration order.
        Set<String> states = new TreeSet<>();
        states.add("MM_O");
        states.add("WW_O_BOS");
        states.add("BB_O_BOS");
        states.add("EE_O_BOS");
        for (String baseTag : this.mTagSet) {
            states.add(BioTagChunkCodec.BEGIN_TAG_PREFIX + baseTag);
            states.add("M_" + baseTag);
            states.add("E_" + baseTag);
            states.add("W_" + baseTag);
            states.add("BB_O_" + baseTag);
            states.add("EE_O_" + baseTag);
            states.add("WW_O_" + baseTag);
        }

        StringBuilder sb = new StringBuilder();
        for (String source : states) {
            sb.append('\n');
            sb.append("start(").append(source).append(")=")
                    .append(this.mHmmEstimator.startLog2Prob(source));
            sb.append('\n');
            sb.append("  end(").append(source).append(")=")
                    .append(this.mHmmEstimator.endLog2Prob(source));
            sb.append('\n');
            for (String target : states) {
                sb.append("trans(").append(source).append(',').append(target).append(")=")
                        .append(this.mHmmEstimator.transitLog2Prob(source, target));
                sb.append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Registers the four boundary states ({@code BB_O_BOS}, {@code MM_O},
     * {@code EE_O_BOS}, {@code WW_O_BOS}) in the estimator's state symbol table
     * and trains one start, end or transition event for each link among them.
     * Called once from the constructor.
     */
    void smoothBoundaries() {
        final String beginOut = "BB_O_BOS";
        final String midOut = "MM_O";
        final String endOut = "EE_O_BOS";
        final String wholeOut = "WW_O_BOS";

        AbstractHmmEstimator estimator = getHmmEstimator();
        SymbolTable symbols = estimator.stateSymbolTable();

        // Registration order is kept as-is since it may determine symbol identifiers.
        for (String state : new String[] {beginOut, midOut, endOut, wholeOut}) {
            symbols.getOrAddSymbol(state);
        }

        estimator.trainStart(beginOut);
        estimator.trainStart(wholeOut);
        estimator.trainEnd(endOut);
        estimator.trainEnd(wholeOut);

        estimator.trainTransit(beginOut, midOut);
        estimator.trainTransit(beginOut, endOut);
        estimator.trainTransit(midOut, midOut);
        estimator.trainTransit(midOut, endOut);
    }

    /**
     * Smooths every tag in the array through {@code smoothTag}, but only when
     * tag smoothing was enabled at construction; otherwise does nothing.
     *
     * @param strArr tags of a training example
     */
    void smoothTags(String[] strArr) {
        if (!this.mSmoothTags) {
            return;
        }
        AbstractHmmEstimator estimator = getHmmEstimator();
        SymbolTable symbols = estimator.stateSymbolTable();
        for (int i = 0; i < strArr.length; ++i) {
            smoothTag(strArr[i], symbols, estimator);
        }
    }

    /**
     * Smooths the base tag of the given tag, as computed by
     * {@code HmmChunker.baseTag}.
     *
     * @param str tag whose base tag is smoothed
     * @param symbolTable state symbol table to register states in
     * @param abstractHmmEstimator estimator to train
     */
    void smoothTag(String str, SymbolTable symbolTable, AbstractHmmEstimator abstractHmmEstimator) {
        String base = HmmChunker.baseTag(str);
        smoothBaseTag(base, symbolTable, abstractHmmEstimator);
    }

    /**
     * Registers the states derived from a base tag and trains one event for each
     * start, end and transition that links them to each other, to the boundary
     * states, and to the states of every base tag seen so far.
     *
     * <p>The tag is always added to the tag set. Nothing else happens when the tag
     * was already present or equals {@code "O"}, so each tag is smoothed at most once.
     *
     * @param str base tag to smooth
     * @param symbolTable state symbol table to register the derived states in
     * @param abstractHmmEstimator estimator that receives the training events
     */
    void smoothBaseTag(String str, SymbolTable symbolTable, AbstractHmmEstimator abstractHmmEstimator) {
        // add() runs first so that "O" is still recorded in the tag set.
        if (!this.mTagSet.add(str) || "O".equals(str)) {
            return;
        }

        // States derived from the tag being smoothed.
        String beginTag = BioTagChunkCodec.BEGIN_TAG_PREFIX + str;
        String midTag = "M_" + str;
        String endTag = "E_" + str;
        String wholeTag = "W_" + str;
        String outBegin = "BB_O_" + str;
        String outEnd = "EE_O_" + str;
        String outWhole = "WW_O_" + str;

        symbolTable.getOrAddSymbol(beginTag);
        symbolTable.getOrAddSymbol(midTag);
        symbolTable.getOrAddSymbol(endTag);
        symbolTable.getOrAddSymbol(wholeTag);
        symbolTable.getOrAddSymbol(outBegin);
        symbolTable.getOrAddSymbol(outEnd);
        symbolTable.getOrAddSymbol(outWhole);

        // Events among this tag's own states and the shared "MM_O" state.
        abstractHmmEstimator.trainStart(beginTag);
        abstractHmmEstimator.trainTransit(beginTag, midTag);
        abstractHmmEstimator.trainTransit(beginTag, endTag);
        abstractHmmEstimator.trainTransit(midTag, midTag);
        abstractHmmEstimator.trainTransit(midTag, endTag);
        abstractHmmEstimator.trainEnd(endTag);
        abstractHmmEstimator.trainTransit(endTag, outBegin);
        abstractHmmEstimator.trainStart(wholeTag);
        abstractHmmEstimator.trainEnd(wholeTag);
        abstractHmmEstimator.trainTransit(wholeTag, outBegin);
        abstractHmmEstimator.trainTransit(outBegin, "MM_O");
        abstractHmmEstimator.trainTransit("MM_O", outEnd);
        abstractHmmEstimator.trainTransit(outEnd, beginTag);
        abstractHmmEstimator.trainTransit(outEnd, wholeTag);
        abstractHmmEstimator.trainStart(outWhole);
        abstractHmmEstimator.trainTransit(outWhole, beginTag);
        abstractHmmEstimator.trainTransit(outWhole, wholeTag);

        // Links to the BOS boundary states.
        abstractHmmEstimator.trainTransit(endTag, "WW_O_BOS");
        abstractHmmEstimator.trainTransit(wholeTag, "WW_O_BOS");
        abstractHmmEstimator.trainTransit(outBegin, "EE_O_BOS");
        abstractHmmEstimator.trainTransit("BB_O_BOS", outEnd);

        // Links in both directions with every known tag, including the one just added.
        for (String other : this.mTagSet) {
            if ("O".equals(other) || "BOS".equals(other)) {
                continue;
            }
            String otherOutBegin = "BB_O_" + other;
            String otherOutWhole = "WW_O_" + other;
            String otherOutEnd = "EE_O_" + other;
            String otherBegin = BioTagChunkCodec.BEGIN_TAG_PREFIX + other;
            String otherWhole = "W_" + other;
            String otherEnd = "E_" + other;

            abstractHmmEstimator.trainTransit(endTag, otherOutWhole);
            abstractHmmEstimator.trainTransit(endTag, otherBegin);
            abstractHmmEstimator.trainTransit(endTag, otherWhole);
            abstractHmmEstimator.trainTransit(wholeTag, otherOutWhole);
            abstractHmmEstimator.trainTransit(wholeTag, otherBegin);
            abstractHmmEstimator.trainTransit(wholeTag, otherWhole);
            abstractHmmEstimator.trainTransit(otherEnd, beginTag);
            abstractHmmEstimator.trainTransit(otherEnd, wholeTag);
            abstractHmmEstimator.trainTransit(otherEnd, outWhole);
            abstractHmmEstimator.trainTransit(otherWhole, beginTag);
            abstractHmmEstimator.trainTransit(otherWhole, wholeTag);
            abstractHmmEstimator.trainTransit(otherWhole, outWhole);
            abstractHmmEstimator.trainTransit(outBegin, otherOutEnd);
            abstractHmmEstimator.trainTransit(otherOutBegin, outEnd);
        }
    }

    /**
     * Tokenizes the span {@code [i, i2)} of the character array and appends every
     * token with the out tag ({@code ChunkTagHandlerAdapter2.OUT_TAG}).
     *
     * <p>The whitespace before the first token is appended first, and the
     * whitespace following each token is appended after it, so this call adds one
     * more whitespace than tokens.
     *
     * @param cArr underlying characters
     * @param i index of the first character of the span
     * @param i2 index one past the last character of the span
     * @param list token list to append to
     * @param list2 whitespace list to append to
     * @param list3 tag list to append to
     * @param tokenizerFactory factory producing the tokenizer for the span
     */
    static void outTag(char[] cArr, int i, int i2, List<String> list, List<String> list2, List<String> list3, TokenizerFactory tokenizerFactory) {
        Tokenizer tokenizer = tokenizerFactory.tokenizer(cArr, i, i2 - i);
        list2.add(tokenizer.nextWhitespace());
        // A null token marks the end of the token stream.
        for (String token = tokenizer.nextToken(); token != null; token = tokenizer.nextToken()) {
            list.add(token);
            list3.add(ChunkTagHandlerAdapter2.OUT_TAG);
            list2.add(tokenizer.nextWhitespace());
        }
    }

    /**
     * Tokenizes the chunk span {@code [i, i2)} and appends its tokens with chunk
     * tags: the first token gets the begin prefix plus the type, every later token
     * the in prefix plus the type.
     *
     * <p>Only the whitespace between tokens of the chunk is appended; whitespace
     * before the first token and after the last token is not added here.
     *
     * <p>NOTE(review): the first token is appended without a null check, so a span
     * that yields no tokens would append {@code null} -- confirm callers never pass
     * such a span.
     *
     * @param cArr underlying characters
     * @param i index of the first character of the chunk
     * @param i2 index one past the last character of the chunk
     * @param str chunk type
     * @param list token list to append to
     * @param list2 whitespace list to append to
     * @param list3 tag list to append to
     * @param tokenizerFactory factory producing the tokenizer for the span
     */
    static void chunkTag(char[] cArr, int i, int i2, String str, List<String> list, List<String> list2, List<String> list3, TokenizerFactory tokenizerFactory) {
        Tokenizer tokenizer = tokenizerFactory.tokenizer(cArr, i, i2 - i);
        String firstToken = tokenizer.nextToken();
        list.add(firstToken);
        list3.add(ChunkTagHandlerAdapter2.BEGIN_TAG_PREFIX + str);

        String whitespace = tokenizer.nextWhitespace();
        String token = tokenizer.nextToken();
        while (token != null) {
            list.add(token);
            list2.add(whitespace);
            list3.add(ChunkTagHandlerAdapter2.IN_TAG_PREFIX + str);
            whitespace = tokenizer.nextWhitespace();
            token = tokenizer.nextToken();
        }
    }

    /**
     * Checks whether re-tokenizing the text rebuilt from the given tokens and
     * whitespaces reproduces exactly those tokens and whitespaces.
     *
     * <p>The text is rebuilt by {@code getChars}. There must be exactly one more
     * whitespace than tokens, and each whitespace and token returned by the
     * tokenizer must equal the corresponding array element.
     *
     * @param strArr expected tokens
     * @param strArr2 expected whitespaces, one more than the tokens
     * @param tokenizerFactory factory producing the tokenizer to check against
     * @return {@code true} if the tokenizer output matches both arrays
     */
    public static boolean consistentTokens(String[] strArr, String[] strArr2, TokenizerFactory tokenizerFactory) {
        final int tokenCount = strArr.length;
        if (strArr2.length != tokenCount + 1) {
            return false;
        }
        char[] chars = getChars(strArr, strArr2);
        Tokenizer tokenizer = tokenizerFactory.tokenizer(chars, 0, chars.length);

        boolean consistent = strArr2[0].equals(tokenizer.nextWhitespace());
        for (int k = 0; consistent && k < tokenCount; ++k) {
            String token = tokenizer.nextToken();
            // Short-circuiting keeps the tokenizer from advancing past the first mismatch.
            consistent = token != null
                    && strArr[k].equals(token)
                    && strArr2[k + 1].equals(tokenizer.nextWhitespace());
        }
        return consistent;
    }

    /**
     * Returns the tokens this chunker's tokenizer factory produces for the text
     * rebuilt from the given tokens and whitespaces.
     *
     * @param strArr tokens used to rebuild the text
     * @param strArr2 whitespaces used to rebuild the text
     * @return tokens produced by re-tokenizing the rebuilt text
     */
    List<String> tokenization(String[] strArr, String[] strArr2) {
        List<String> tokens = new ArrayList<>();
        // The whitespaces are collected only because tokenize() requires a second list.
        List<String> whitespaces = new ArrayList<>();
        char[] chars = getChars(strArr, strArr2);
        this.mTokenizerFactory.tokenizer(chars, 0, chars.length).tokenize(tokens, whitespaces);
        return tokens;
    }

    /**
     * Rebuilds the text for a token sequence by interleaving whitespaces and
     * tokens: whitespace {@code k} precedes token {@code k}, and the last element
     * of the whitespace array closes the text.
     *
     * @param strArr tokens
     * @param strArr2 whitespaces; must be non-empty and at least as long as {@code strArr}
     * @return the characters of the rebuilt text
     */
    static char[] getChars(String[] strArr, String[] strArr2) {
        StringBuilder text = new StringBuilder();
        int whitespaceIndex = 0;
        for (String token : strArr) {
            text.append(strArr2[whitespaceIndex++]).append(token);
        }
        text.append(strArr2[strArr2.length - 1]);
        return Strings.toCharArray(text);
    }
}
