Skip to content

Commit

Permalink
Merge branch 'TIKA-2520' of https://github.com/mbaechler/tika into br…
Browse files Browse the repository at this point in the history
…anch_1x
  • Loading branch information
chrismattmann committed May 24, 2018
2 parents cdca0f7 + 124a06d commit 7e3e34c
Showing 1 changed file with 36 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import org.apache.tika.language.detect.LanguageNames;
import org.apache.tika.language.detect.LanguageResult;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
Expand All @@ -44,39 +46,59 @@
*/
public class OptimaizeLangDetector extends LanguageDetector {

private static final List<LanguageProfile> DEFAULT_LANGUAGE_PROFILES;
private static final ImmutableSet<String> DEFAULT_LANGUAGES;
private static final com.optimaize.langdetect.LanguageDetector DEFAULT_DETECTOR;


// One-time class initialization: reads the built-in language profiles, derives the set of
// language names from them, and builds the shared default detector (created with null
// language probabilities). These values are reused by every instance.
static {
    try {
        DEFAULT_LANGUAGE_PROFILES = ImmutableList.copyOf(new LanguageProfileReader().readAllBuiltIn());

        ImmutableSet.Builder<String> builder = new ImmutableSet.Builder<>();
        for (LanguageProfile profile : DEFAULT_LANGUAGE_PROFILES) {
            builder.add(makeLanguageName(profile.getLocale()));
        }
        DEFAULT_LANGUAGES = builder.build();

        DEFAULT_DETECTOR = createDetector(DEFAULT_LANGUAGE_PROFILES, null);
    } catch (IOException e) {
        // Keep the IOException as the cause: a failure here surfaces as an
        // ExceptionInInitializerError, and without the cause the underlying I/O problem
        // (e.g. an unreadable profile resource) would be impossible to diagnose.
        throw new RuntimeException("can't initialize OptimaizeLangDetector", e);
    }
}

private static final int MAX_CHARS_FOR_DETECTION = 20000;
private static final int MAX_CHARS_FOR_SHORT_DETECTION = 200;

private com.optimaize.langdetect.LanguageDetector detector;
private CharArrayWriter writer;
private Set<String> languages;
private Map<String, Float> languageProbabilities;

/**
 * Creates a detector whose text buffer is allocated with an initial size of
 * {@code MAX_CHARS_FOR_DETECTION} characters.
 * <p>
 * The constructor does not assign a detector; that happens in {@code loadModels}.
 */
public OptimaizeLangDetector() {
    // The superclass no-arg constructor is invoked implicitly.
    this.writer = new CharArrayWriter(MAX_CHARS_FOR_DETECTION);
}

@Override
public LanguageDetector loadModels() throws IOException {
List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();

public LanguageDetector loadModels() {
// FUTURE when the "language-detector" project supports short profiles, check if
// isShortText() returns true and switch to those.

languages = new HashSet<>();
for (LanguageProfile profile : languageProfiles) {
languages.add(makeLanguageName(profile.getLocale()));

languages = DEFAULT_LANGUAGES;

if (languageProbabilities != null) {
detector = createDetector(DEFAULT_LANGUAGE_PROFILES, languageProbabilities);
} else {
detector = DEFAULT_DETECTOR;
}

detector = createDetector(languageProfiles);


return this;

}

private String makeLanguageName(LdLocale locale) {
private static String makeLanguageName(LdLocale locale) {
return LanguageNames.makeName(locale.getLanguage(), locale.getScript().orNull(), locale.getRegion().orNull());
}

Expand All @@ -98,12 +120,12 @@ public LanguageDetector loadModels(Set<String> languages) throws IOException {
}
}

detector = createDetector(new LanguageProfileReader().readBuiltIn(locales));
detector = createDetector(new LanguageProfileReader().readBuiltIn(locales), languageProbabilities);

return this;
}

private com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles) {
private static com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles, Map<String, Float> languageProbabilities) {
// FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which
// means you can often get 0 probabilities. So we pick a very short length for this limit.
LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard())
Expand Down

0 comments on commit 7e3e34c

Please sign in to comment.