Skip to content

Commit

Permalink
Stopwords Fix PR (#1012)
Browse files Browse the repository at this point in the history
* telemetry fix

* comment fixes

* comments fix

* junits

* comment fixes

* comment fixes

* comment fixes

* comment fixes

* comment fixes and refactor code

* comment fixes and refactor code

* pmd comments

* adding asserts

* Update TestStopWordUtility.java

* updating asserts
  • Loading branch information
sania-16 authored Feb 7, 2025
1 parent 5cdff3f commit f01b55f
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 11 deletions.
11 changes: 11 additions & 0 deletions common/client/src/main/java/zingg/common/client/HasStopWords.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package zingg.common.client;

import scala.Serializable;

/**
 * Helper for deciding whether a {@link FieldDefinition} has stop words configured.
 */
public class HasStopWords implements Serializable {

    /**
     * Tells whether the given field definition carries a usable stop-words value.
     *
     * @param f the field definition to inspect
     * @return {@code true} when {@code f.getStopWords()} is neither {@code null}
     *         nor the empty string, {@code false} otherwise
     */
    public static boolean isStopwordField(FieldDefinition f){
        String stopWords = f.getStopWords();
        // Check the content with isEmpty(): == on strings only compares references,
        // so an empty string created at runtime would not be recognised as empty.
        return stopWords != null && !stopWords.isEmpty();
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import zingg.common.client.FieldDefinition;
import zingg.common.client.IArguments;
import zingg.common.client.IZArgs;
import zingg.common.client.MatchType;
import zingg.common.client.ZFrame;
import zingg.common.client.ZinggClientException;
Expand Down Expand Up @@ -251,7 +250,7 @@ public List<FieldDefinition> getFieldDefinitionFiltered(IArguments args, MatchT
.stream()
.filter(f -> !(f.getMatchType() == null || f.getMatchType().contains(type)))
.collect(Collectors.toList());
}
}

public ZFrame<D,R,C> postprocess(ZFrame<D,R,C> actual, ZFrame<D,R,C> orig) {
List<C> cols = new ArrayList<C>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package zingg.common.client.util;

import java.io.Serializable;
import java.util.List;
import java.util.stream.Collectors;

import zingg.common.client.FieldDefinition;
import zingg.common.client.HasStopWords;
import zingg.common.client.IArguments;

public class StopWordUtility implements Serializable {

private static final long serialVersionUID = 1L;

public List<? extends FieldDefinition> getFieldDefinitionWithStopwords(List<? extends FieldDefinition> fieldDefinition) {

return fieldDefinition.stream()
.filter(f -> HasStopWords.isStopwordField(f))
.collect(Collectors.toList());
}

public String getFieldDefinitionNamesWithStopwords(IArguments args) {

return getFieldDefinitionWithStopwords(args.getFieldDefinition()).stream()
.map(FieldDefinition::getName)
.collect(Collectors.joining(", "));
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package zingg.common.client.util;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.junit.jupiter.api.Test;

import zingg.common.client.Arguments;
import zingg.common.client.FieldDefinition;
import zingg.common.client.IArguments;
import zingg.common.client.MatchType;
import zingg.common.client.ZinggClientException;

/**
 * Unit tests for {@link StopWordUtility}.
 *
 * <p>The tests let every exception propagate so that JUnit reports a failure
 * instead of the test passing silently after a swallowed error.
 */
public class TestStopWordUtility {

    /**
     * Only the field definition that has a stop-words value set must be returned;
     * definitions with no value or an explicit {@code null} are filtered out.
     *
     * @throws Exception if building the fixtures or the call under test fails
     */
    @Test
    public void testGetFieldDefinitionWithStopwords() throws Exception {
        // No stop words set at all.
        FieldDefinition def1 = new FieldDefinition();
        def1.setFieldName("field1");
        def1.setDataType("string");
        def1.setMatchTypeInternal(MatchType.FUZZY);
        def1.setFields("field1");

        // Stop words configured: the only definition expected in the result.
        FieldDefinition def2 = new FieldDefinition();
        def2.setFieldName("field2");
        def2.setDataType("string");
        def2.setMatchTypeInternal(MatchType.EXACT);
        def2.setStopWords("stopWordsFileName2");
        def2.setFields("field2");

        // Stop words explicitly set to null.
        FieldDefinition def3 = new FieldDefinition();
        def3.setFieldName("field3");
        def3.setDataType("string");
        def3.setMatchTypeInternal(MatchType.FUZZY);
        def3.setStopWords(null);
        def3.setFields("field3");

        List<FieldDefinition> fieldDef = new ArrayList<FieldDefinition>();
        fieldDef.add(def1);
        fieldDef.add(def2);
        fieldDef.add(def3);

        List<? extends FieldDefinition> stopWordList = new StopWordUtility().getFieldDefinitionWithStopwords(fieldDef);
        assertEquals(1, stopWordList.size());
        assertEquals("field2", stopWordList.get(0).getName());
    }

    /**
     * The names of all definitions with stop words must be joined with
     * {@code ", "} in their original order.
     *
     * @throws Exception if building the fixtures or the call under test fails
     */
    @Test
    public void testGetFieldDefinitionNamesWithStopwords() throws Exception {
        // No stop words set: must not appear in the result.
        FieldDefinition def1 = new FieldDefinition();
        def1.setFieldName("field1");
        def1.setDataType("string");
        def1.setMatchTypeInternal(MatchType.FUZZY);
        def1.setFields("field1");

        FieldDefinition def2 = new FieldDefinition();
        def2.setFieldName("field2");
        def2.setDataType("string");
        def2.setMatchTypeInternal(MatchType.EXACT);
        def2.setStopWords("stopWordsFileName2");
        def2.setFields("field2");

        FieldDefinition def3 = new FieldDefinition();
        def3.setFieldName("field3");
        def3.setDataType("string");
        def3.setMatchTypeInternal(MatchType.FUZZY);
        def3.setStopWords("stopWordsFileName3");
        def3.setFields("field3");

        List<FieldDefinition> fieldDef = new ArrayList<FieldDefinition>();
        fieldDef.add(def1);
        fieldDef.add(def2);
        fieldDef.add(def3);

        // Let a setup failure surface directly rather than continuing with null arguments.
        IArguments args = new Arguments();
        args.setFieldDefinition(fieldDef);

        String result = new StopWordUtility().getFieldDefinitionNamesWithStopwords(args);
        assertEquals("field2, field3", result);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import zingg.common.client.ZinggClientException;
import zingg.common.client.util.ColName;
import zingg.common.client.util.ColValues;
import zingg.common.client.util.StopWordUtility;
import zingg.common.core.context.IContext;
import zingg.common.core.util.Analytics;
import zingg.common.core.util.Metric;
Expand Down Expand Up @@ -71,23 +72,17 @@ public void setSession(S s) {
}




public void track( boolean collectMetrics){
public void track(boolean collectMetrics){
Analytics.track(Metric.TOTAL_FIELDS_COUNT, args.getFieldDefinition().size(), collectMetrics);
Analytics.track(Metric.MATCH_FIELDS_COUNT, getDSUtil().getFieldDefinitionFiltered(args, MatchType.DONT_USE).size(),
collectMetrics);
Analytics.track(Metric.MATCH_FIELDS_COUNT, getDSUtil().getFieldDefinitionFiltered(args, MatchType.DONT_USE).size(), collectMetrics);
Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics);
Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics);
Analytics.track(Metric.MODEL_ID, args.getModelId(), collectMetrics);

Analytics.track(Metric.STOPWORDS,new StopWordUtility().getFieldDefinitionNamesWithStopwords(args), collectMetrics);

}





/**
 * Returns the value of this object's {@code context} field.
 *
 * @return the current {@code context}
 */
public IContext<S,D,R,C,T> getContext() {
    return context;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class Metric {
public static final String TRAINING_MATCHES = "trainingDataMatches";
public static final String TRAINING_NONMATCHES = "trainingDataNonmatches";
public static final String DATA_COUNT = "dataCount";
public static final String STOPWORDS = "stopWords";

public static final long timeout = 1200L;
public static final double confidence = 0.95; // default value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,5 @@ public void testGetFieldDefColumnsWhenShowConciseIsFalse() throws ZinggClientExc
assertTrue(expectedColumnsTest2.get(i).contains(colListTest2.get(i).toString()));
};
}

}

0 comments on commit f01b55f

Please sign in to comment.