/*
 * Decompiled with CFR 0.152.
 */
package org.structr.text;

import java.io.IOException;
import java.io.Writer;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.structr.api.config.Settings;

/**
 * A {@link Writer} that splits the characters written to it into index words.
 *
 * <p>Characters that are alphabetic, digits, or contained in {@link #SpecialChars}
 * are accumulated into the current word; any other character terminates the
 * current word. In parallel, a "raw text" copy of the input is kept in which
 * non-word, non-whitespace characters are replaced by a single space.
 *
 * <p>Words are only recorded while the number of recorded words is below the
 * limit read from {@code Settings.IndexingLimit}, and only if their length lies
 * within the bounds read from {@code Settings.IndexingMinLength} and
 * {@code Settings.IndexingMaxLength}. On {@link #close()} the language of the
 * raw text is detected via Tika's {@code LanguageIdentifier}.
 */
public class FulltextTokenizer extends Writer {
    private static final Logger logger = LoggerFactory.getLogger(FulltextTokenizer.class);

    /** Non-alphanumeric characters that are nevertheless treated as part of a word. */
    public static final Set<Character> SpecialChars = new LinkedHashSet<Character>();

    static {
        // Insertion order is kept because the set is a LinkedHashSet.
        final char[] specials = {
            '_', '\u00e4', '\u00f6', '\u00fc', '\u00c4', '\u00d6', '\u00dc', '\u00df',
            '\u00a7', '-', '%', '/', '@', '$', '\u20ac', '\u00e6', '\u00a2',
            '.', ',', '\'', '"', '`'
        };
        for (final char special : specials) {
            SpecialChars.add(special);
        }
    }

    /** Tokens made up solely of digits, '-', '.' and ',' are kept verbatim. */
    private static final Pattern NUMERIC_TOKEN = Pattern.compile("[\\-0-9\\.,]+");

    /** Separator used to break a non-numeric token that contains '.' or ','. */
    private static final Pattern PART_SEPARATOR = Pattern.compile("[\\.,]+");

    /** Threshold of the repeat counter above which a repeated character is dropped. */
    private static final int MAX_CONSECUTIVE_REPEATS = 10;

    private final int wordCountLimit = (Integer)Settings.IndexingLimit.getValue();
    private final int wordMinLength = (Integer)Settings.IndexingMinLength.getValue();
    private final int wordMaxLength = (Integer)Settings.IndexingMaxLength.getValue();
    private final StringBuilder rawText = new StringBuilder();
    private final StringBuilder wordBuffer = new StringBuilder();
    private final Set<String> words = new LinkedHashSet<String>();
    private final String fileName;
    private String language = "de";
    private char lastCharacter = '\u0000';
    private int consecutiveCharCount = 0;
    private int wordCount = 0;

    /**
     * Creates a tokenizer.
     *
     * @param fileName name used in the log message emitted when the word limit is reached
     */
    public FulltextTokenizer(String fileName) {
        this.fileName = fileName;
    }

    /**
     * Consumes characters from {@code cbuf}, starting at {@code off} and ending
     * at {@code off + len} or the end of the array, whichever comes first.
     *
     * <p>Processing stops as soon as the word limit is reached, so neither words
     * nor raw text grow beyond that point.
     *
     * @param cbuf source characters
     * @param off  index of the first character to read
     * @param len  number of characters to read
     * @throws IOException declared by {@link Writer}
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        if (this.wordCount >= this.wordCountLimit) {
            return;
        }
        final int end = Math.min(off + len, cbuf.length);
        // The limit is re-checked for every character: a single large buffer must
        // not be able to push the word count past the configured limit.
        for (int i = off; i < end && this.wordCount < this.wordCountLimit; ++i) {
            final char c = cbuf[i];
            if (c == this.lastCharacter) {
                // Drop long runs of the same character (the counter is
                // incremented on every repeat, including dropped ones).
                if (this.consecutiveCharCount++ >= MAX_CONSECUTIVE_REPEATS) {
                    continue;
                }
            } else {
                this.consecutiveCharCount = 0;
            }
            final boolean wordChar = Character.isAlphabetic(c)
                || Character.isDigit(c)
                || SpecialChars.contains(c);
            if (wordChar) {
                this.wordBuffer.append(c);
                this.rawText.append(c);
            } else {
                // A non-word character terminates the current word.
                this.flush();
                // Whitespace is kept as-is; everything else becomes a space.
                if (Character.isWhitespace(c)) {
                    this.rawText.append(c);
                } else {
                    this.rawText.append(" ");
                }
            }
            this.lastCharacter = c;
        }
    }

    /**
     * @return the detected language, or the default {@code "de"} if
     *         {@link #close()} has not been called or detection was not
     *         reasonably certain
     */
    public String getLanguage() {
        return this.language;
    }

    /**
     * @return the text collected so far, with non-word, non-whitespace
     *         characters replaced by spaces
     */
    public String getRawText() {
        return this.rawText.toString();
    }

    /**
     * @return the distinct words recorded so far, in insertion order; this is
     *         the tokenizer's internal set, not a copy
     */
    public Set<String> getWords() {
        return this.words;
    }

    /**
     * Turns the buffered characters into zero or more words and clears the buffer.
     *
     * <p>A token containing '.' or ',' is kept verbatim if it consists only of
     * digits, '-', '.' and ','; otherwise it is split at runs of '.' and ','
     * and each non-blank part is recorded in lower case. Any other token is
     * recorded in lower case as a whole.
     *
     * @throws IOException declared by {@link Writer}
     */
    @Override
    public void flush() throws IOException {
        final String word = this.wordBuffer.toString().trim();
        this.wordBuffer.setLength(0);
        if (!StringUtils.isNotBlank(word)) {
            return;
        }
        if (!word.contains(".") && !word.contains(",")) {
            // Locale.ROOT keeps tokens identical regardless of the JVM default locale.
            this.addWord(word.toLowerCase(Locale.ROOT));
            return;
        }
        // NOTE(review): this also matches punctuation-only tokens such as "..." —
        // confirm whether those are meant to be indexed.
        if (NUMERIC_TOKEN.matcher(word).matches()) {
            this.addWord(word);
            return;
        }
        for (final String rawPart : PART_SEPARATOR.split(word)) {
            final String part = rawPart.trim();
            if (StringUtils.isNotBlank(part)) {
                this.addWord(part.toLowerCase(Locale.ROOT));
            }
        }
    }

    /**
     * Flushes the pending word and runs language detection on the raw text.
     * The language is only updated when the identifier reports that it is
     * reasonably certain.
     *
     * @throws IOException declared by {@link Writer}
     */
    @Override
    public void close() throws IOException {
        this.flush();
        final LanguageIdentifier identifier = new LanguageIdentifier(this.rawText.toString());
        if (identifier.isReasonablyCertain()) {
            this.language = identifier.getLanguage();
        }
    }

    /**
     * @return the number of words recorded so far; repeated words are counted
     *         each time they occur
     */
    public int getWordCount() {
        return this.wordCount;
    }

    /**
     * Records a word if the word limit has not been reached and the word's
     * length is within the configured bounds. Logs once, at the moment the
     * limit is reached.
     *
     * @param word the token to record
     */
    private void addWord(String word) {
        if (this.wordCount >= this.wordCountLimit) {
            // Limit already reached (and logged); ignore further words.
            return;
        }
        final int length = word.length();
        if (length < this.wordMinLength || length > this.wordMaxLength) {
            return;
        }
        this.words.add(word);
        if (++this.wordCount == this.wordCountLimit) {
            logger.info("Indexing word count of {} reached for {}, no more words will be indexed. Set {} in structr.conf to increase this limit.", new Object[]{this.wordCountLimit, this.fileName, Settings.IndexingLimit.getKey()});
        }
    }
}

