Skip to content

WIP: feature extractors #587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions big-data-utils/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand All @@ -23,7 +23,7 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>org.xeustechnologies.google-api</groupId>
Expand Down
10 changes: 5 additions & 5 deletions chunker/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand All @@ -13,7 +13,7 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>

<dependency>
Expand All @@ -24,12 +24,12 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>LBJava-NLP-tools</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-pos</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
Expand All @@ -44,7 +44,7 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-curator</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
Expand Down
20 changes: 10 additions & 10 deletions commasrl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down Expand Up @@ -35,48 +35,48 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-curator</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-tokenizer</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-corpusreaders</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-inference</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>stanford_3.3.1</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-pos</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-ner</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-chunker</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
Expand Down
2 changes: 1 addition & 1 deletion core-utilities/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
*/
package edu.illinois.cs.cogcomp.core.datastructures;

import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntFloatHashMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.procedure.TIntIntProcedure;
Expand All @@ -17,12 +19,13 @@
import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
* A lexicon manager that manages features. Stores a hash value for string features and maps to an
* integer id. Optionally stores the string values too. Method previewFeature( String ) gets the
* integer id. Optionally stores the string values too.
*
* @author Vivek Srikumar
*/
Expand Down Expand Up @@ -72,6 +75,10 @@ public Lexicon(InputStream in) throws IOException {
this(in, false);
}

/**
 * Loads a lexicon from a file by opening a {@link FileInputStream} on it and delegating to the
 * {@code (InputStream, boolean)} constructor.
 *
 * NOTE(review): the FileInputStream created here is not closed in this constructor; confirm
 * that the stream-based constructor closes it, otherwise the file handle leaks.
 *
 * @param f the file to read the lexicon from
 * @param loadStrings forwarded unchanged to the stream-based constructor
 * @throws IOException if the file cannot be opened or the stream-based constructor fails
 */
public Lexicon(File f, boolean loadStrings) throws IOException {
this(new FileInputStream(f), loadStrings);
}

public Lexicon(InputStream in, boolean loadStrings) throws IOException {
GZIPInputStream zipin = new GZIPInputStream(in);

Expand Down Expand Up @@ -142,6 +149,10 @@ public String lookupName(int id) {
return featureNames.get(id);
}

/**
 * Returns the feature-name list held by this lexicon. The returned object is the internal
 * field itself, not a copy, so changes made through it are visible to this lexicon.
 *
 * NOTE(review): other code in this class null-checks {@code featureNames} before using it,
 * so this may return null — confirm against the constructors.
 *
 * @return the internal list of feature names
 */
public List<String> getFeatureNames() {
return this.featureNames;
}

/**
* Increment the count for featureId.
*/
Expand Down Expand Up @@ -175,15 +186,13 @@ public synchronized void previewFeature(String f) {

// If there is a hash collision, print a warning
if (feature2Id.containsKey(featureHash)) {
logger.warn("Possible hash collision in lexicon " + "for feature name = {}, hash = {}", f,
logger.warn("Possible hash collision in lexicon for feature name = {}, hash = {}", f,
featureHash);
} else {

feature2Id.put(featureHash, nextFeatureId++);
}

if (featureNames != null) {
featureNames.add(f);
if (featureNames != null) {
featureNames.add(f);
}
}
}

Expand Down Expand Up @@ -249,6 +258,27 @@ public Pair<int[], float[]> getFeatureVector(Map<String, Float> featureMap) {
return new Pair<>(ids, vals);
}

/**
 * Maps a list of feature names to the ids of those features that are present in this lexicon.
 * Names for which {@code contains} returns false are skipped, and each id appears at most once
 * in the result, at the position of its first occurrence in {@code features}.
 *
 * @param features names of the active features; may contain duplicates and unknown names
 * @return the distinct ids of the known features, in order of first occurrence
 */
public int[] getFeatureVector(List<String> features) {
    // A LinkedHashSet gives constant-time duplicate checks (instead of a linear scan of the
    // ids collected so far) while still preserving first-occurrence order.
    Set<Integer> distinctIds = new LinkedHashSet<>();
    for (String feature : features) {
        if (contains(feature)) {
            distinctIds.add(lookupId(feature));
        }
    }

    int[] ids = new int[distinctIds.size()];
    int next = 0;
    for (int id : distinctIds) {
        ids[next++] = id;
    }
    return ids;
}

/**
 * Returns the internal {@code feature2Id} map. The returned object is the field itself, not a
 * copy. Elsewhere in this class its entries are written as (feature hash, feature id).
 *
 * @return the internal map from feature hash to feature id
 */
public TIntIntHashMap getFeatureMap() {
return feature2Id;
}

public Pair<int[], float[]> pruneFeaturesByCount(int[] idx, float[] fs, int threshold) {
int[] array = new int[idx.length];
float[] vals = new float[array.length];
Expand Down Expand Up @@ -314,19 +344,15 @@ public void save(String file) throws IOException {

writeInt(writer, feature2Id.size());

feature2Id.forEachEntry(new TIntIntProcedure() {

@Override
public boolean execute(int a, int b) {
try {
writeInt(writer, a);
writeInt(writer, b);
feature2Id.forEachEntry((hash, id) -> {
try {
writeInt(writer, hash);
writeInt(writer, id);

} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
});

if (featureNames != null) {
Expand Down Expand Up @@ -354,21 +380,43 @@ private void writeInt(BufferedWriter writer, int integer) throws IOException {

/***
* prunes the lexicon by removing features with less than threshold many counts
* If true, it would include the feature counts in the new generated lexicon
* @param keepCounts whether to keep the feature counts in the pruned feature map or not.
* @param resetFeatureIds if true, the retained features are renumbered consecutively, starting from zero. This is useful in
* the cases where pruning drops many of the features, and leaves many of the ids unused.
*/
public Lexicon getPrunedLexicon(final int threshold) {
final Lexicon lex = new Lexicon(false, false);

this.feature2Id.forEachEntry(new TIntIntProcedure() {
public Lexicon getPrunedLexicon(final int threshold, boolean keepCounts, boolean resetFeatureIds, boolean hasBias, boolean storeStrings) {
final Lexicon lex = new Lexicon(hasBias, storeStrings);

@Override
public boolean execute(int hash, int id) {
AtomicInteger nextId = new AtomicInteger(-1);

if (featureCounts.get(id) > threshold)
lex.feature2Id.put(hash, id);
return true;
this.feature2Id.forEachEntry((hash, id) -> {
String featureName = "";
if(storeStrings && this.featureNames != null) {
featureName = this.featureNames.get(id);
}
int count = featureCounts.get(id);
if (count > threshold) {
int newId;
if(resetFeatureIds)
newId = nextId.incrementAndGet();
else
newId = id;
lex.feature2Id.put(hash, newId);
if(keepCounts) lex.featureCounts.put(newId, count);
if(storeStrings && this.featureNames != null) {
// pad the name list with empty placeholders so that index newId exists before it is set
for (int i = lex.featureNames.size(); i <= newId; i++)
lex.featureNames.add("");
lex.featureNames.set(newId, featureName);
}
}
return true;
});
lex.nextFeatureId = this.nextFeatureId;
if(resetFeatureIds)
lex.nextFeatureId = nextId.incrementAndGet();
else
lex.nextFeatureId = this.nextFeatureId;

logger.info("Number of features after pruning: " + lex.size());

Expand Down
Loading