CogComp · danyaljj · Nov 10, 2017 · Nov 10, 2017 · Nov 10, 2017 · Nov 11, 2017
diff --git a/big-data-utils/pom.xml b/big-data-utils/pom.xml
@@ -3,7 +3,7 @@
     <parent>
         <artifactId>illinois-cogcomp-nlp</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>3.1.36</version>
+        <version>3.1.39</version>
     </parent>
 
     <modelVersion>4.0.0</modelVersion>
@@ -23,7 +23,7 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-core-utilities</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>org.xeustechnologies.google-api</groupId>

diff --git a/chunker/pom.xml b/chunker/pom.xml
@@ -2,7 +2,7 @@
     <parent>
         <artifactId>illinois-cogcomp-nlp</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>3.1.36</version>
+        <version>3.1.39</version>
     </parent>
 
     <modelVersion>4.0.0</modelVersion>
@@ -13,7 +13,7 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-core-utilities</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
 
         <dependency>
@@ -24,12 +24,12 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>LBJava-NLP-tools</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-pos</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
@@ -44,7 +44,7 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-curator</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>

diff --git a/commasrl/pom.xml b/commasrl/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <artifactId>illinois-cogcomp-nlp</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>3.1.36</version>
+        <version>3.1.39</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
@@ -35,48 +35,48 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-core-utilities</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
             <optional>true</optional>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-curator</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-tokenizer</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-corpusreaders</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-inference</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>stanford_3.3.1</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-pos</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-ner</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-chunker</artifactId>
-            <version>3.1.36</version>
+            <version>3.1.39</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>

diff --git a/core-utilities/pom.xml b/core-utilities/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>illinois-cogcomp-nlp</artifactId>
         <groupId>edu.illinois.cs.cogcomp</groupId>
-        <version>3.1.36</version>
+        <version>3.1.39</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 

diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/Lexicon.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/Lexicon.java
@@ -7,6 +7,8 @@
  */
 package edu.illinois.cs.cogcomp.core.datastructures;
 
+import gnu.trove.list.TIntList;
+import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TIntFloatHashMap;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.procedure.TIntIntProcedure;
@@ -17,12 +19,13 @@
 import java.io.*;
 import java.util.*;
 import java.util.Map.Entry;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 
 /**
  * A lexicon manager that manages features. Stores a hash value for string features and maps to an
- * integer id. Optionally stores the string values too. Method previewFeature( String ) gets the
+ * integer id. Optionally stores the string values too.
  *
  * @author Vivek Srikumar
  */
@@ -72,6 +75,10 @@ public Lexicon(InputStream in) throws IOException {
         this(in, false);
     }
 
+    public Lexicon(File f, boolean loadStrings) throws IOException {
+        this(new FileInputStream(f), loadStrings);
+    }
+
     public Lexicon(InputStream in, boolean loadStrings) throws IOException {
         GZIPInputStream zipin = new GZIPInputStream(in);
 
@@ -142,6 +149,10 @@ public String lookupName(int id) {
         return featureNames.get(id);
     }
 
+    public List<String> getFeatureNames() {
+        return this.featureNames;
+    }
+
     /**
      * Increment the count for featureId.
      */
@@ -175,15 +186,13 @@ public synchronized void previewFeature(String f) {
 
         // If there is a hash collision, print a warning
         if (feature2Id.containsKey(featureHash)) {
-            logger.warn("Possible hash collision in lexicon " + "for feature name = {}, hash = {}", f,
+            logger.warn("Possible hash collision in lexicon for feature name = {}, hash = {}", f,
                     featureHash);
         } else {
-
             feature2Id.put(featureHash, nextFeatureId++);
-        }
-
-        if (featureNames != null) {
-            featureNames.add(f);
+            if (featureNames != null) {
+                featureNames.add(f);
+            }
         }
     }
 
@@ -249,6 +258,27 @@ public Pair<int[], float[]> getFeatureVector(Map<String, Float> featureMap) {
         return new Pair<>(ids, vals);
     }
 
+    /**
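+     * Generates the sparse feature-id representation of the given list of active features.
+     * @param features list of active feature names; names for which contains(f) is false are skipped
+     * @return the ids of the remaining features, without duplicates, in order of first occurrence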
+     */
+    public int[] getFeatureVector(List<String> features) {
+        TIntList feats = new TIntArrayList();
+        for (String f : features) {
+            if (!contains(f))
+                continue;
+            int id = lookupId(f);
+            if (!feats.contains(id))
+                feats.add(id);
+        }
+        return feats.toArray();
+    }
+
+    public TIntIntHashMap getFeatureMap() {
+        return feature2Id;
+    }
+
     public Pair<int[], float[]> pruneFeaturesByCount(int[] idx, float[] fs, int threshold) {
         int[] array = new int[idx.length];
         float[] vals = new float[array.length];
@@ -314,19 +344,15 @@ public void save(String file) throws IOException {
 
         writeInt(writer, feature2Id.size());
 
-        feature2Id.forEachEntry(new TIntIntProcedure() {
-
-            @Override
-            public boolean execute(int a, int b) {
-                try {
-                    writeInt(writer, a);
-                    writeInt(writer, b);
+        feature2Id.forEachEntry((hash, id) -> {
+            try {
+                writeInt(writer, hash);
+                writeInt(writer, id);
 
-                } catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
-                return true;
+            } catch (IOException e) {
+                throw new RuntimeException(e);
             }
+            return true;
         });
 
         if (featureNames != null) {
@@ -354,21 +380,43 @@ private void writeInt(BufferedWriter writer, int integer) throws IOException {
 
     /***
      * prunes the lexicon by removing features with less than threshold many counts
+     * Only features whose count is strictly greater than the threshold are kept in the new lexicon.
+     * @param keepCounts if true, the counts of the kept features are included in the newly generated lexicon.
+     * @param resetFeatureIds if true, kept features get new consecutive ids starting from zero. This is useful in
+     *                        the cases where pruning drops many of the features, and leaves many of the ids unused.
      */
-    public Lexicon getPrunedLexicon(final int threshold) {
-        final Lexicon lex = new Lexicon(false, false);
-
-        this.feature2Id.forEachEntry(new TIntIntProcedure() {
+    public Lexicon getPrunedLexicon(final int threshold, boolean keepCounts, boolean resetFeatureIds, boolean hasBias, boolean storeStrings) {
+        final Lexicon lex = new Lexicon(hasBias, storeStrings);
 
-            @Override
-            public boolean execute(int hash, int id) {
+        AtomicInteger nextId = new AtomicInteger(-1);
 
-                if (featureCounts.get(id) > threshold)
-                    lex.feature2Id.put(hash, id);
-                return true;
+        this.feature2Id.forEachEntry((hash, id) -> {
+            String featureName = "";
+            if(storeStrings && this.featureNames != null) {
+                featureName = this.featureNames.get(id);
             }
+            int count = featureCounts.get(id);
+            if (count > threshold) {
+                int newId;
+                if(resetFeatureIds)
+                    newId = nextId.incrementAndGet();
+                else
+                    newId = id;
+                lex.feature2Id.put(hash, newId);
+                if(keepCounts) lex.featureCounts.put(newId, count);
+                if(storeStrings && this.featureNames != null) {
+                    // expand the name list with empty placeholders so that index newId exists
+                    for (int i = lex.featureNames.size(); i <= newId; i++)
+                        lex.featureNames.add("");
+                    lex.featureNames.set(newId, featureName);
+                }
+            }
+            return true;
         });
-        lex.nextFeatureId = this.nextFeatureId;
+        if(resetFeatureIds)
+            lex.nextFeatureId = nextId.incrementAndGet();
+        else
+            lex.nextFeatureId = this.nextFeatureId;
 
         logger.info("Number of features after pruning: " + lex.size());