diff --git a/Dockerfile b/Dockerfile
index 978bbfd52..bdfe8cfb5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,17 +4,17 @@ USER 0
RUN apt-get update && \
apt install -y curl vim
ENV SPARK_MASTER local[*]
-ENV ZINGG_HOME /zingg-0.5.0
+ENV ZINGG_HOME /zingg-0.6.0
ENV PATH $ZINGG_HOME/scripts:$PATH
ENV LANG C.UTF-8
WORKDIR /
USER root
-WORKDIR /zingg-0.5.0
-RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.5.0/zingg-0.5.0-spark-3.5.0.tar.gz | \
+WORKDIR /zingg-0.6.0
+RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.6.0/zingg-0.6.0-spark-3.5.0.tar.gz | \
tar --extract --gzip --strip=1
RUN pip install -r python/requirements.txt
RUN pip install zingg
-RUN chmod -R a+rwx /zingg-0.5.0/models
-RUN chown -R spark /zingg-0.5.0/models
+RUN chmod -R a+rwx /zingg-0.6.0/models
+RUN chown -R spark /zingg-0.6.0/models
USER spark
diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java
index 114221e0d..9faa8bd06 100644
--- a/common/client/src/main/java/zingg/common/client/Client.java
+++ b/common/client/src/main/java/zingg/common/client/Client.java
@@ -59,7 +59,7 @@ public Client(IZArgs args, ClientOptions options, String zFactory) throws ZinggC
setOptions(options);
try {
buildAndSetArguments(args, options);
- setZingg(args, options);
+ setZingg(options);
}
catch (Exception e) {
throw new ZinggClientException("An error has occured while setting up the client", e);
@@ -91,14 +91,14 @@ public IZinggFactory getZinggFactory() throws InstantiationException, IllegalAcc
- public void setZingg(IZArgs args, ClientOptions options) throws Exception{
+ public void setZingg(ClientOptions options) throws Exception{
IZinggFactory zf = getZinggFactory();
try{
setZingg(zf.get(ZinggOptions.getByValue(options.get(ClientOptions.PHASE).value.trim())));
}
catch(Exception e) {
- //set default
- setZingg(zf.get(ZinggOptions.getByValue(ZinggOptions.PEEK_MODEL.getName())));
+ LOG.error("Error creating zingg instance for phase " + options.get(ClientOptions.PHASE).value.trim(), e);
+ throw e;
}
}
diff --git a/common/client/src/main/java/zingg/common/client/MatchTypes.java b/common/client/src/main/java/zingg/common/client/MatchTypes.java
index 3edd727fe..cbd552369 100644
--- a/common/client/src/main/java/zingg/common/client/MatchTypes.java
+++ b/common/client/src/main/java/zingg/common/client/MatchTypes.java
@@ -42,13 +42,13 @@ public static String[] getAllMatchTypes() {
return s;
}
- public static IMatchType getByName(String name) throws Exception{
+ public static IMatchType getByName(String name) throws IllegalArgumentException{
for (IMatchType zo: MatchTypes.allMatchTypes.values()) {
if (zo.getName().equalsIgnoreCase(name)) {
return zo;
}
}
- return null;
+ throw new IllegalArgumentException("Invalid match type: " + name);
}
}
diff --git a/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java b/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java
index 64ea1a5a5..7658ce8e1 100644
--- a/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java
+++ b/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java
@@ -45,7 +45,7 @@ public A loadArguments(String path) throws ZinggClientException, NoSuchObjectExc
@Override
public void writeArguments(String path, IZArgs args) throws ZinggClientException, NoSuchObjectException {
- ArgumentsWriter argumentsWriter = writerFactory.getArgumentsWriter(WriterType.JSON);
+ ArgumentsWriter argumentsWriter = writerFactory.getArgumentsWriter(WriterType.FILE);
argumentsWriter.write(path, args);
}
diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java
index cbca5647b..bc30825da 100644
--- a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java
+++ b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java
@@ -1,11 +1,11 @@
package zingg.common.client.options;
-import java.util.HashMap;
-import java.util.Map;
-
import zingg.common.client.ZinggClientException;
import zingg.common.client.util.Util;
+import java.util.HashMap;
+import java.util.Map;
+
public class ZinggOptions {
public final static ZinggOption TRAIN = new ZinggOption("train");
@@ -18,9 +18,6 @@ public class ZinggOptions {
public final static ZinggOption RECOMMEND = new ZinggOption("recommend");
public final static ZinggOption UPDATE_LABEL = new ZinggOption("updateLabel");
public final static ZinggOption FIND_AND_LABEL = new ZinggOption("findAndLabel");
- public final static ZinggOption ASSESS_MODEL = new ZinggOption("assessModel");
- public final static ZinggOption PEEK_MODEL = new ZinggOption("peekModel");
- public final static ZinggOption EXPORT_MODEL = new ZinggOption("exportModel");
public static Map allZinggOptions;// = new HashMap();
diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java
index f0961907b..0e26a2593 100644
--- a/common/client/src/test/java/zingg/common/client/TestArguments.java
+++ b/common/client/src/test/java/zingg/common/client/TestArguments.java
@@ -1,5 +1,14 @@
package zingg.common.client;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.jupiter.api.Test;
+import zingg.common.client.arguments.ArgumentServiceImpl;
+import zingg.common.client.arguments.IArgumentService;
+import zingg.common.client.arguments.loader.template.EnvironmentVariableSubstitutor;
+import zingg.common.client.arguments.model.Arguments;
+import zingg.common.client.arguments.model.IArguments;
+
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -9,16 +18,8 @@
import java.util.List;
import java.util.Map;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.junit.jupiter.api.Test;
-import zingg.common.client.arguments.ArgumentServiceImpl;
-import zingg.common.client.arguments.IArgumentService;
-import zingg.common.client.arguments.model.Arguments;
-import zingg.common.client.arguments.model.IArguments;
-import zingg.common.client.arguments.loader.template.EnvironmentVariableSubstitutor;
-
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
public class TestArguments {
diff --git a/common/core/src/test/resources/testPeekModel/config.json b/common/core/src/test/resources/testPeekModel/config.json
deleted file mode 100644
index 07c02b3ec..000000000
--- a/common/core/src/test/resources/testPeekModel/config.json
+++ /dev/null
@@ -1,94 +0,0 @@
-{
- "fieldDefinition":[
- {
- "fieldName" : "id",
- "matchType" : "dont_use",
- "fields" : "fname",
- "dataType": "string"
- },
- {
- "fieldName" : "fname",
- "matchType" : "fuzzy",
- "fields" : "fname",
- "dataType": "string"
- },
- {
- "fieldName" : "lname",
- "matchType" : "fuzzy",
- "fields" : "lname",
- "dataType": "string"
- },
- {
- "fieldName" : "stNo",
- "matchType": "exact",
- "fields" : "stNo",
- "dataType": "string"
- },
- {
- "fieldName" : "add1",
- "matchType": "fuzzy",
- "fields" : "add1",
- "dataType": "string"
- },
- {
- "fieldName" : "add2",
- "matchType": "fuzzy",
- "fields" : "add2",
- "dataType": "string"
- },
- {
- "fieldName" : "city",
- "matchType": "fuzzy",
- "fields" : "city",
- "dataType": "string"
- },
- {
- "fieldName" : "areacode",
- "matchType": "exact",
- "fields" : "areacode",
- "dataType": "string"
- },
- {
- "fieldName" : "state",
- "matchType": "exact",
- "fields" : "state",
- "dataType": "string"
- },
- {
- "fieldName" : "dob",
- "matchType": "exact",
- "fields" : "dob",
- "dataType": "string"
- },
- {
- "fieldName" : "ssn",
- "matchType": "exact",
- "fields" : "ssn",
- "dataType": "string"
- }
- ],
- "output" : [{
- "name":"output",
- "format":"csv",
- "props": {
- "path": "/tmp/testPeekModel/zinggOutput",
- "delimiter": ",",
- "header":true
- }
- }],
- "data" : [{
- "name":"test",
- "format":"csv",
- "props": {
- "path": "./testPeekModel/test.csv",
- "delimiter": ",",
- "header":false
- },
- "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string"
- }],
- "labelDataSampleSize" : 0.5,
- "numPartitions":4,
- "modelId": 100,
- "zinggDir": "./testFebrl/models"
-
-}
diff --git a/common/core/src/test/resources/testPeekModel/test.csv b/common/core/src/test/resources/testPeekModel/test.csv
deleted file mode 100644
index 91175cb1c..000000000
--- a/common/core/src/test/resources/testPeekModel/test.csv
+++ /dev/null
@@ -1,65 +0,0 @@
-rec-1020-org, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765
-rec-1021-dup-0, thomas, georgze,1, mcmanus place, , north turarmurra,3130, sa,19630225,5460534
-rec-1021-org, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534
-rec-1022-dup-1, jackson, eglinron,840, mountview, fowles treet, burlei gh heads,2803, sa,19830807,2932837
-rec-1022-dup-2, jackson, eglinton,840, fowles street, moun tvjiew, burleigh heads,2830, ss, ,2932837
-rec-1022-dup-3, jackson, christo,840, fowles street, mou ntveiw, burleig heads,2830, sa,19830807,2932837
-rec-1022-dup-4, jackson, eglinton,840, fowles street, mountv iew, burleigh heads,2830, sa,19830807,2932837
-rec-1022-org, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837
-rec-1023-org, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080
-rec-1024-org, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362
-rec-1025-org, emiily, britten,8, kitchener street, hilltop hostel rowethorpe, lake heights,2463, qld,19491021,9588775
-rec-1026-dup-0, xani, green, , phill ip avenue, , armidale,5108, nsw,19390410,9201057
-rec-1026-dup-1, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201857
-rec-1026-org, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201057
-rec-1027-org, nathan, smallacombe,20, guthridge crescent, red cross units, sandy bay,6056, sa,19241223,7522263
-rec-1028-dup-0, , ,24, , woorinyan, riverwood,3749, qld,19180205,9341716
-rec-1028-dup-1, , eglinton,24, curriecrescent, woorinyan, riverwood,3749, qld,19180205,1909717
-rec-1028-org, , eglinton,24, currie crescent, woorinyan, riverwood,3749, qld,19180205,9341716
-rec-1029-dup-0, kylee, stepehndon,81, rose scott circuit, cordobak anor, ashfield,4226, vic,19461101,4783085
-rec-1029-dup-1, sachin, stephenson,81, rose scott circuit, cordoba manor, ashfi eld,4226, vic,19461101,4783085
-rec-1029-dup-2, annalise, stephenson,81, rose scott circuit, cordoba manor, ashfoeld,4226, vic,19461101,4783085
-rec-1029-dup-3, kykee, turale,81, rose scott circuit, , ashfield,4226, vic,19461101,4783085
-rec-1029-dup-4, kylee, stephenson,81, cordoba manor, rose scott circuit, ashfield,4226, vic,19461101,4783085
-rec-1029-org, kylee, stephenson,81, rose scott circuit, cordoba manor, ashfield,4226, vic,19461101,4783085
-rec-103-dup-0, benjamin, koerbin,15, wybel anah, violet grover place, mill park,2446, nsw,19210210,3808808
-rec-103-org, briony, koerbin,146, violet grover place, wybelanah, mill park,2446, nsw,19210210,3808808
-rec-1030-org, emma, crossman,53, mcdowall place, kellhaven, tara,5608, vic,19391027,3561186
-rec-1031-org, samantha, sabieray,68, quandong street, wattle brae, gorokan,4019, wa,19590807,2863290
-rec-1032-dup-0, brooklyn, naar-cafentas,210, duffy street, tourist psrk, berwick,2481, nsw, ,3624304
-rec-1032-org, brooklyn, naar-cafentas,210, duffy street, tourist park, berwick,2481, nsw,19840802,3624304
-rec-1033-dup-0, keziah, painter,18, ainsli e avenue, sec 1, torquay,3205, vic,19191031,7801066
-rec-1033-org, keziah, painter,18, ainslie avenue, sec 1, torquay,3205, vic,19191031,7801066
-rec-1034-dup-0, erin, maynard,24, , wariala, little river,2777, vic,19970430,7429462
-rec-1034-dup-1, erin, maynard,51, wilshire street, warialda, little irver,2777, vic,19970430,1815999
-rec-1034-dup-2, hayley, maynard,14, wilshire street, , little river,2777, vic,19970430,7429462
-rec-1034-org, erin, maynard,14, wilshire street, warialda, little river,2777, vic,19970430,7429462
-rec-1035-dup-0, jaiden, rollins,48, tulgeywood, rossarden street, balwyn north,2224, nt,19280722,7626396
-rec-1035-dup-1, jaiden, rollins,95, rossarden street, tulgewyood, balwyn north,2224, nt,19280722,7626396
-rec-1035-dup-2, jaiden, rolilns,48, swinden street, tulgeywood, balwyn north,2224, nt,19280722,7626396
-rec-1035-dup-3, jaiden, rolli ns,48, tulgeywomod, rossarden street, balwyn north,2224, nf,19280722,7626396
-rec-1035-org, jaiden, rollins,48, rossarden street, tulgeywood, balwyn north,2224, nt,19280722,7626396
-rec-1036-dup-0, , held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651
-rec-1036-dup-1, sarsha, held,42, lampard circuit, , golden bay,2447, vic,19510806,3710651
-rec-1036-org, amber, held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651
-rec-1037-org, connor, beckwith,10, heard street, , mill park,5031, nsw,19081103,2209091
-rec-1038-org, danny, campbell,95, totterdell street, moama, shellharbour,2209, vic,19951105,9554924
-rec-1039-dup-0, angus, roas,62, gormansto crescent, mlc centre, kiruwah,3350, sa,19250817,2655081
-rec-1039-org, angus, rosa,62, gormanston crescent, mlc centre, kirwan,3350, sa,19250817,2655081
-rec-104-dup-0, benjaminl, carbone,18, arthella, wattle s treet, orange,3550, vic,19050820,3677127
-rec-104-org, benjamin, carbone,18, wattle street, arthella, orange,3550, vic,19050820,3677127
-rec-1040-dup-0, matilda, mestrov, , housecicuit, retirement village, taringa,3820, qld,19801119,2536135
-rec-1040-dup-1, matilda, mestrv,5, house circuit, retirement village, taringa,3802, qld,19801119,2563135
-rec-1040-dup-2, matilda, mestrov,5, house circuit, retiremen tvillage, taringa,3820, ,19801119,2563135
-rec-1040-org, matilda, mestrov,5, house circuit, retirement village, taringa,3820, qld,19801119,2563135
-rec-1041-dup-0, tyler, frojd, , burramurra avenue, kmart p plaza, san rmeo,3670, sa,19800916,7812219
-rec-1041-org, tyler, froud,8, burramurra avenue, kmart p plaza, san remo,3670, sa,19800916,7812219
-rec-1042-dup-0, kiandra, ,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205
-rec-1042-dup-1, kiandra, cowle,2, gatliff place, rustenubr g sth, girgarre,3995, qld,19801125,3328205
-rec-1042-org, kiandra, cowle,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205
-rec-1043-org, giorgia, frahn,62, handasyde street, ramano estate locn 1, tallebudgera,4506, vic,19670206,9724789
-rec-1044-dup-0, nicole, shadbolt,46, schlich s treet, simpson army barracks, toowoomba,3000, wa,19030926,8190756
-rec-1044-dup-1, nicole, carbone,46, schlich nstreet, simpson army barracks, toowoomba,3000, wa,19030926,8190756
-rec-1044-dup-2, nicole, carbone,46, schlich street, simpson arm ybarracks, toowong,3000, wa,19030926,8190756
-rec-1044-dup-3, nicole, carbone,46, schlich street, simpsonary barracks, toowoomba,3000, wa,19030926,8190756
-rec-1044-org, nicole, carbone,46, schlich street, simpson army barracks, toowoomba,3000, wa,19030926,8190756
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index b83a8b29b..8584c3ce3 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -43,7 +43,6 @@
* [Using Pre-existing Training Data](setup/training/addOwnTrainingData.md)
* [Updating Labeled Pairs](updatingLabels.md)
* [Documenting The Training Data](stepbystep/createtrainingdata/generatingdocumentation.md)
- * [Exporting Labeled Data](setup/training/exportLabeledData.md)
* [Model Difference](stepbystep/createtrainingdata/modeldiff.md)
* [Ensuring Scalability](verifyBlocking.md)
* [Building And Saving The Model](setup/train.md)
diff --git a/docs/connectors/jdbc/clickhouse.md b/docs/connectors/jdbc/clickhouse.md
new file mode 100644
index 000000000..076d84f2a
--- /dev/null
+++ b/docs/connectors/jdbc/clickhouse.md
@@ -0,0 +1,53 @@
+# ClickHouse
+
+Zingg reads from and writes to ClickHouse through the ClickHouse JDBC driver.
+The pipe definitions below show the JSON settings for reading and writing data.
+
+## ClickHouse Input (Reading)
+
+```json
+"data": [
+ {
+ "name": "clickhouse_input",
+ "format": "jdbc",
+ "props": {
+ "url": "jdbc:clickhouse:https://<host>:<port>/<database>?ssl=true",
+ "driver": "com.clickhouse.jdbc.ClickHouseDriver",
+ "user": "",
+ "password": "",
+ "dbtable": ""
+ }
+ }
+]
+```
+
+## ClickHouse Output (Writing)
+
+```json
+"output": [
+ {
+ "name": "clickhouse_output",
+ "format": "jdbc",
+ "props": {
+ "url": "jdbc:clickhouse:https://<host>:<port>/<database>?ssl=true",
+ "driver": "com.clickhouse.jdbc.ClickHouseDriver",
+ "user": "",
+ "password": "",
+ "dbtable": "",
+ "saveMode": "append"
+ }
+ }
+]
+```
+
+## Implementation Steps
+
+### Add the Driver Jar
+Download the `clickhouse-jdbc-0.9.8-all.jar` and add its path to `config/zingg.conf` to ensure Spark can load the driver:
+
+```properties
+spark.jars=/path/to/clickhouse-jdbc-0.9.8-all.jar
+```
+
+### Port
+Use port `8443` for ClickHouse Cloud (HTTPS) or `8123` for local HTTP instances.
diff --git a/docs/setup/training/addOwnTrainingData.md b/docs/setup/training/addOwnTrainingData.md
index b25c1f815..21706faf4 100644
--- a/docs/setup/training/addOwnTrainingData.md
+++ b/docs/setup/training/addOwnTrainingData.md
@@ -17,6 +17,4 @@ Here, the first column specifies the z_cluster, the second column specifies the
The above training data can be specified using [trainingSamples attribute in the configuration.](../../../examples/febrl/configWithTrainingSamples.json)
-In addition, labeled data of one model can also be exported and used as training data for another model. For details, check out [exporting labeled data](exportLabeledData.md).
-
**Note**: It is advisable to still run [findTrainingData](findTrainingData.md) and [label](label.md) a few rounds to tune Zingg with the supplied training data as well as patterns it needs to learn independently.
diff --git a/docs/setup/training/exportLabeledData.md b/docs/setup/training/exportLabeledData.md
deleted file mode 100644
index aaf32faed..000000000
--- a/docs/setup/training/exportLabeledData.md
+++ /dev/null
@@ -1,12 +0,0 @@
----
-parent: Creating training data
-title: Exporting labeled data as csv
-grand_parent: Step By Step Guide
-nav_order: 4
----
-
-# Exporting Labeled Data
-
-If we need to send our labeled data for a subject matter expert to review or if we want to build another model in a new location and [reuse training effort](addOwnTrainingData.md) from earlier, we can write our labeled data to a csv
-
-`./scripts/zingg.sh --phase exportModel --conf --location `
diff --git a/python/phases/exportModel.py b/python/phases/exportModel.py
deleted file mode 100644
index 336b1f587..000000000
--- a/python/phases/exportModel.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from zingg.client import *
-import sys
-import argparse
-import os
-
-logging.basicConfig(level=logging.INFO)
-LOG = logging.getLogger("zingg.exportModel")
-
-def main():
-
- # ckecking for mandatory option --location for this phase
- if(ClientOptions(sys.argv[1:]).hasLocation()==False):
- LOG.error("--location argument is mandatory for this phase, please specify")
- LOG.info("--location is location of CSV file for exported data")
- sys.exit()
-
- LOG.info("Phase ExportModel starts")
-
- options = ClientOptions(sys.argv[1:])
- options.setPhase("peekModel")
- arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
- client = Zingg(arguments, options)
- client.init()
-
- pMarkedDF = getPandasDfFromDs(client.getMarkedRecords())
- labelledData = getSparkSession().createDataFrame(pMarkedDF)
- location = options.getLocation()
-
- export_data(labelledData, location)
-
- LOG.info("Phase ExportModel ends")
-
-def export_data(labelledData, location):
-
- baseCols = ['z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_zsource', 'z_isMatch']
- sourceDataColumns = [c for c in labelledData.columns if c not in baseCols]
- additionalTrainingColumns = ['z_cluster','z_isMatch']
- trainingSampleColumns = [*additionalTrainingColumns, *sourceDataColumns]
- trainingSamples = labelledData.select(trainingSampleColumns)
-
- # Getting schema
- trainingSamples.schema.jsonValue()
- trainingSamples.show()
- trainingSamples.columns
- print(trainingSampleColumns)
-
- # Exporting the labelled data as CSV
- trainingSamples.toPandas().to_csv(os.path.join(location,r'exportedData.csv'), index=False)
-
-
-if __name__ == "__main__":
- main()
diff --git a/python/zingg/client.py b/python/zingg/client.py
index 772127617..ce33b45fd 100644
--- a/python/zingg/client.py
+++ b/python/zingg/client.py
@@ -670,14 +670,6 @@ def setLabelDataSampleSize(self, labelDataSampleSize):
"""
self.args.setLabelDataSampleSize(labelDataSampleSize)
- def writeArgumentsToJSON(self, fileName):
- """Method to write JSON file from the object of this class
-
- :param fileName: The CONF parameter value of ClientOption object or file address of json file
- :type fileName: String
- """
- getJVM().zingg.common.client.arguments.ArgumentServiceImpl().writeArguments(fileName, self.args)
-
def setStopWordsCutoff(self, stopWordsCutoff):
"""Method to set stopWordsCutoff parameter value
By default, Zingg extracts 10% of the high frequency unique words from a dataset. If user wants different selection, they should set up StopWordsCutoff property
@@ -711,29 +703,6 @@ def createArgumentsFromJSON(fileName, phase):
- obj.args = getJVM().zingg.common.client.argumentst.ArgumentServiceImpl().loadArguments(fileName)
+ obj.args = getJVM().zingg.common.client.arguments.ArgumentServiceImpl().loadArguments(fileName)
return obj
- def writeArgumentsToJSONString(self):
- """Method to create an object of this class from the JSON file and phase parameter value.
-
- :param fileName: The CONF parameter value of ClientOption object
- :type fileName: String
- :param phase: The PHASE parameter value of ClientOption object
- :type phase: String
- :return: The pointer containing address of the this class object
- :rtype: pointer(Arguments)
- """
- jsonString = getJVM().java.lang.String()
- return getJVM().zingg.common.client.arguments.ArgumentServiceImpl().writeArguments(jsonString, self.args)
-
- @staticmethod
- def createArgumentsFromJSONString(jsonArgs, phase):
- obj = Arguments()
- obj.args = getJVM().zingg.common.client.arguments.ArgumentServiceImpl().loadArguments(jsonArgs)
- return obj
-
- def copyArgs(self, phase):
- argsString = self.writeArgumentsToJSONString()
- return self.createArgumentsFromJSONString(argsString, phase)
-
class ClientOptions:
"""Class that contains Client options for Zingg object
@@ -770,7 +739,5 @@ def __init__(self, argsSent=None):
args = argsSent.copy()
- if self.PHASE not in args:
- args.append(self.PHASE)
- args.append("peekModel")
+ # No default phase exists any more: callers must pass --phase explicitly.
if self.LICENSE not in args:
args.append(self.LICENSE)
args.append("zinggLic.txt")
@@ -922,7 +890,7 @@ def parseArguments(argv):
"""
parser = argparse.ArgumentParser(description="Zingg's python APIs")
mandatoryOptions = parser.add_argument_group("mandatory arguments")
- mandatoryOptions.add_argument("--phase", required=True, help="python phase e.g. assessModel")
+ mandatoryOptions.add_argument("--phase", required=True)
mandatoryOptions.add_argument(
"--conf",
required=True,
diff --git a/spark/client/src/test/java/zingg/spark/client/TestArguments.java b/spark/client/src/test/java/zingg/spark/client/TestArguments.java
index a3b130fac..19aeb848a 100644
--- a/spark/client/src/test/java/zingg/spark/client/TestArguments.java
+++ b/spark/client/src/test/java/zingg/spark/client/TestArguments.java
@@ -1,28 +1,27 @@
package zingg.spark.client;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.rmi.NoSuchObjectException;
-import java.util.Arrays;
-import java.util.List;
-
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.jupiter.api.Test;
-
+import zingg.common.client.FieldDefinition;
+import zingg.common.client.IMatchType;
+import zingg.common.client.MatchTypes;
+import zingg.common.client.ZinggClientException;
import zingg.common.client.arguments.ArgumentServiceImpl;
import zingg.common.client.arguments.IArgumentService;
import zingg.common.client.arguments.loader.LoaderFactory;
import zingg.common.client.arguments.model.Arguments;
-import zingg.common.client.FieldDefinition;
import zingg.common.client.arguments.model.IArguments;
-import zingg.common.client.IMatchType;
-import zingg.common.client.MatchTypes;
-import zingg.common.client.ZinggClientException;
import zingg.common.client.arguments.writer.WriterFactory;
import zingg.common.client.pipe.Pipe;
import zingg.spark.client.pipe.SparkPipe;
+import java.rmi.NoSuchObjectException;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
public class TestArguments {
public static final Log LOG = LogFactory.getLog(TestArguments.class);
@@ -62,7 +61,7 @@ public void testWriteArgumentObjectToJSONFile() throws ZinggClientException, NoS
args.setBlockSize(400L);
args.setCollectMetrics(true);
args.setModelId("500");
- argumentService.loadArguments("/tmp/configFromArgObject.json");
+ argumentService.writeArguments("/tmp/configFromArgObject.json", args);
//reload the same config file to check if deserialization is successful
IArguments newArgs = argumentService.loadArguments("/tmp/configFromArgObject.json");
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPythonPhaseRunner.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPythonPhaseRunner.java
deleted file mode 100644
index d31de7957..000000000
--- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPythonPhaseRunner.java
+++ /dev/null
@@ -1,72 +0,0 @@
-package zingg.spark.core.executor;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.spark.deploy.PythonRunner;
-import org.apache.spark.sql.Column;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.types.DataType;
-import org.apache.spark.sql.SparkSession;
-
-import zingg.common.client.ClientOptions;
-import zingg.common.client.arguments.model.IZArgs;
-import zingg.common.client.ZinggClientException;
-import zingg.common.client.options.ZinggOptions;
-
-import zingg.common.core.executor.ZinggBase;
-import zingg.spark.core.context.ZinggSparkContext;
-
-
-public class SparkPythonPhaseRunner extends ZinggBase, Row, Column, DataType>{
-
- private static final long serialVersionUID = 1L;
- protected static String name = "zingg.spark.core.executor.SparkPythonPhaseRunner";
- public static final Log LOG = LogFactory.getLog(SparkPythonPhaseRunner.class);
-
- public SparkPythonPhaseRunner() {
- setZinggOption(ZinggOptions.PEEK_MODEL);
- setContext(new ZinggSparkContext());
-
- }
-
- @Override
- public void init(IZArgs args, SparkSession s, ClientOptions options)
- throws ZinggClientException {
- super.init(args,s,options);
- getContext().setUtils();
- //we wil not init here as we wnt py to drive
- //the spark session etc
- getContext().init(s);
- }
-
- @Override
- public void execute() throws ZinggClientException {
- try {
- //closing session here
- //as pyspark will further create it
- //TODO getOrCreate not working in pyspark
- SparkSession sparkSession = context.getSession();
- sparkSession.stop();
- LOG.info("Generic Python phase starts");
- //LOG.info(this.getClass().getClassLoader().getResource("python/phases/assessModel.py").getFile());
- List pyArgs = new ArrayList();
- String phase = clientOptions.get(ClientOptions.PHASE).getValue();
- pyArgs.add("python/phases/" + phase + ".py");
- pyArgs.add("");
- for (String c: clientOptions.getCommandLineArgs()) {
- pyArgs.add(c);
- }
- PythonRunner.main(pyArgs.toArray(new String[pyArgs.size()]));
-
- LOG.info("Generic Python phase ends");
- } catch (Exception exception) {
- throw new ZinggClientException("Error occurred while executing python phase, ", exception);
- }
- }
-
-
-}
diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java
index 26827cc9c..d190264d0 100644
--- a/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java
+++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java
@@ -24,7 +24,6 @@ public SparkZFactory() {}
zinggers.put(ZinggOptions.UPDATE_LABEL, SparkLabelUpdater.name);
zinggers.put(ZinggOptions.FIND_AND_LABEL, SparkFindAndLabeller.name);
zinggers.put(ZinggOptions.RECOMMEND, SparkRecommender.name);
- zinggers.put(ZinggOptions.PEEK_MODEL, SparkPythonPhaseRunner.name);
}
public IZingg get(ZinggOption z) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
diff --git a/spark/core/src/test/java/zingg/spark/core/executor/TestPeekModel.java b/spark/core/src/test/java/zingg/spark/core/executor/TestPeekModel.java
deleted file mode 100644
index 1c1e9e652..000000000
--- a/spark/core/src/test/java/zingg/spark/core/executor/TestPeekModel.java
+++ /dev/null
@@ -1,46 +0,0 @@
-package zingg.spark.core.executor;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-/**end to end integration test*/
-public class TestPeekModel {
- public static final Log LOG = LogFactory.getLog(TestPeekModel.class);
-
- /*
- InMemoryPipe outputPipe;
-
- @BeforeEach
- public void setUp() throws Exception, ZinggClientException{
- args = Arguments.createArgumentsFromJSON(getClass().getResource("/testPeekModel/config.json").getFile());
- args.setZinggDir(getClass().getResource("/testFebrl/models").getPath());
- Pipe dataPipe = args.getData()[0];
- dataPipe.setProp(FilePipe.LOCATION, getClass().getResource("/testPeekModel/test.csv").getPath());
- args.setData(new Pipe[]{dataPipe});
- outputPipe = new InMemoryPipe(dataPipe);
- args.setOutput(new Pipe[]{outputPipe});
- }
-
-
- @Test
- public void testOutput(){
- PeekModel pm = new PeekModel();
- try {
- pm.init(args, "abc");
- pm.setSpark(spark);
- pm.setArgs(args);
- pm.setClientOptions(new ClientOptions("--phase", "assessModel", "--conf", "testPeekModel/config.json", "--license", "licText.txt"));
- pm.execute();
-
- Dataset dfm = pm.getMarkedRecords();
- assertEquals(80,dfm.count());
-
-
- } catch (ZinggClientException e) {
- // TODO Auto-generated catch block
- fail("did not expect " + e);
-
- }
-
- }
- */
-}
diff --git a/spark/core/src/test/resources/testPeekModel/config.json b/spark/core/src/test/resources/testPeekModel/config.json
deleted file mode 100644
index 4b1547936..000000000
--- a/spark/core/src/test/resources/testPeekModel/config.json
+++ /dev/null
@@ -1,94 +0,0 @@
-{
- "fieldDefinition":[
- {
- "fieldName" : "id",
- "matchType" : "dont_use",
- "fields" : "fname",
- "dataType": "string"
- },
- {
- "fieldName" : "fname",
- "matchType" : "fuzzy",
- "fields" : "fname",
- "dataType": "string"
- },
- {
- "fieldName" : "lname",
- "matchType" : "fuzzy",
- "fields" : "lname",
- "dataType": "string"
- },
- {
- "fieldName" : "stNo",
- "matchType": "exact",
- "fields" : "stNo",
- "dataType": "string"
- },
- {
- "fieldName" : "add1",
- "matchType": "fuzzy",
- "fields" : "add1",
- "dataType": "string"
- },
- {
- "fieldName" : "add2",
- "matchType": "fuzzy",
- "fields" : "add2",
- "dataType": "string"
- },
- {
- "fieldName" : "city",
- "matchType": "fuzzy",
- "fields" : "city",
- "dataType": "string"
- },
- {
- "fieldName" : "areacode",
- "matchType": "exact",
- "fields" : "areacode",
- "dataType": "string"
- },
- {
- "fieldName" : "state",
- "matchType": "exact",
- "fields" : "state",
- "dataType": "string"
- },
- {
- "fieldName" : "dob",
- "matchType": "exact",
- "fields" : "dob",
- "dataType": "string"
- },
- {
- "fieldName" : "ssn",
- "matchType": "exact",
- "fields" : "ssn",
- "dataType": "string"
- }
- ],
- "output" : [{
- "name":"output",
- "format":"csv",
- "props": {
- "path": "/tmp/testPeekModel/zinggOutput",
- "delimiter": ",",
- "header":true
- }
- }],
- "data" : [{
- "name":"test",
- "format":"csv",
- "props": {
- "path": "./testPeekModel/test.csv",
- "delimiter": ",",
- "header":false
- },
- "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string"
- }],
- "labelDataSampleSize" : 0.5,
- "numPartitions":4,
- "modelId": 100,
- "zinggDir": "./testFebrl/models"
-
-}
diff --git a/spark/core/src/test/resources/testPeekModel/test.csv b/spark/core/src/test/resources/testPeekModel/test.csv
deleted file mode 100644
index 91175cb1c..000000000
--- a/spark/core/src/test/resources/testPeekModel/test.csv
+++ /dev/null
@@ -1,65 +0,0 @@
-rec-1020-org, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765
-rec-1021-dup-0, thomas, georgze,1, mcmanus place, , north turarmurra,3130, sa,19630225,5460534
-rec-1021-org, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534
-rec-1022-dup-1, jackson, eglinron,840, mountview, fowles treet, burlei gh heads,2803, sa,19830807,2932837
-rec-1022-dup-2, jackson, eglinton,840, fowles street, moun tvjiew, burleigh heads,2830, ss, ,2932837
-rec-1022-dup-3, jackson, christo,840, fowles street, mou ntveiw, burleig heads,2830, sa,19830807,2932837
-rec-1022-dup-4, jackson, eglinton,840, fowles street, mountv iew, burleigh heads,2830, sa,19830807,2932837
-rec-1022-org, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837
-rec-1023-org, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080
-rec-1024-org, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362
-rec-1025-org, emiily, britten,8, kitchener street, hilltop hostel rowethorpe, lake heights,2463, qld,19491021,9588775
-rec-1026-dup-0, xani, green, , phill ip avenue, , armidale,5108, nsw,19390410,9201057
-rec-1026-dup-1, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201857
-rec-1026-org, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201057
-rec-1027-org, nathan, smallacombe,20, guthridge crescent, red cross units, sandy bay,6056, sa,19241223,7522263
-rec-1028-dup-0, , ,24, , woorinyan, riverwood,3749, qld,19180205,9341716
-rec-1028-dup-1, , eglinton,24, curriecrescent, woorinyan, riverwood,3749, qld,19180205,1909717
-rec-1028-org, , eglinton,24, currie crescent, woorinyan, riverwood,3749, qld,19180205,9341716
-rec-1029-dup-0, kylee, stepehndon,81, rose scott circuit, cordobak anor, ashfield,4226, vic,19461101,4783085
-rec-1029-dup-1, sachin, stephenson,81, rose scott circuit, cordoba manor, ashfi eld,4226, vic,19461101,4783085
-rec-1029-dup-2, annalise, stephenson,81, rose scott circuit, cordoba manor, ashfoeld,4226, vic,19461101,4783085
-rec-1029-dup-3, kykee, turale,81, rose scott circuit, , ashfield,4226, vic,19461101,4783085
-rec-1029-dup-4, kylee, stephenson,81, cordoba manor, rose scott circuit, ashfield,4226, vic,19461101,4783085
-rec-1029-org, kylee, stephenson,81, rose scott circuit, cordoba manor, ashfield,4226, vic,19461101,4783085
-rec-103-dup-0, benjamin, koerbin,15, wybel anah, violet grover place, mill park,2446, nsw,19210210,3808808
-rec-103-org, briony, koerbin,146, violet grover place, wybelanah, mill park,2446, nsw,19210210,3808808
-rec-1030-org, emma, crossman,53, mcdowall place, kellhaven, tara,5608, vic,19391027,3561186
-rec-1031-org, samantha, sabieray,68, quandong street, wattle brae, gorokan,4019, wa,19590807,2863290
-rec-1032-dup-0, brooklyn, naar-cafentas,210, duffy street, tourist psrk, berwick,2481, nsw, ,3624304
-rec-1032-org, brooklyn, naar-cafentas,210, duffy street, tourist park, berwick,2481, nsw,19840802,3624304
-rec-1033-dup-0, keziah, painter,18, ainsli e avenue, sec 1, torquay,3205, vic,19191031,7801066
-rec-1033-org, keziah, painter,18, ainslie avenue, sec 1, torquay,3205, vic,19191031,7801066
-rec-1034-dup-0, erin, maynard,24, , wariala, little river,2777, vic,19970430,7429462
-rec-1034-dup-1, erin, maynard,51, wilshire street, warialda, little irver,2777, vic,19970430,1815999
-rec-1034-dup-2, hayley, maynard,14, wilshire street, , little river,2777, vic,19970430,7429462
-rec-1034-org, erin, maynard,14, wilshire street, warialda, little river,2777, vic,19970430,7429462
-rec-1035-dup-0, jaiden, rollins,48, tulgeywood, rossarden street, balwyn north,2224, nt,19280722,7626396
-rec-1035-dup-1, jaiden, rollins,95, rossarden street, tulgewyood, balwyn north,2224, nt,19280722,7626396
-rec-1035-dup-2, jaiden, rolilns,48, swinden street, tulgeywood, balwyn north,2224, nt,19280722,7626396
-rec-1035-dup-3, jaiden, rolli ns,48, tulgeywomod, rossarden street, balwyn north,2224, nf,19280722,7626396
-rec-1035-org, jaiden, rollins,48, rossarden street, tulgeywood, balwyn north,2224, nt,19280722,7626396
-rec-1036-dup-0, , held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651
-rec-1036-dup-1, sarsha, held,42, lampard circuit, , golden bay,2447, vic,19510806,3710651
-rec-1036-org, amber, held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651
-rec-1037-org, connor, beckwith,10, heard street, , mill park,5031, nsw,19081103,2209091
-rec-1038-org, danny, campbell,95, totterdell street, moama, shellharbour,2209, vic,19951105,9554924
-rec-1039-dup-0, angus, roas,62, gormansto crescent, mlc centre, kiruwah,3350, sa,19250817,2655081
-rec-1039-org, angus, rosa,62, gormanston crescent, mlc centre, kirwan,3350, sa,19250817,2655081
-rec-104-dup-0, benjaminl, carbone,18, arthella, wattle s treet, orange,3550, vic,19050820,3677127
-rec-104-org, benjamin, carbone,18, wattle street, arthella, orange,3550, vic,19050820,3677127
-rec-1040-dup-0, matilda, mestrov, , housecicuit, retirement village, taringa,3820, qld,19801119,2536135
-rec-1040-dup-1, matilda, mestrv,5, house circuit, retirement village, taringa,3802, qld,19801119,2563135
-rec-1040-dup-2, matilda, mestrov,5, house circuit, retiremen tvillage, taringa,3820, ,19801119,2563135
-rec-1040-org, matilda, mestrov,5, house circuit, retirement village, taringa,3820, qld,19801119,2563135
-rec-1041-dup-0, tyler, frojd, , burramurra avenue, kmart p plaza, san rmeo,3670, sa,19800916,7812219
-rec-1041-org, tyler, froud,8, burramurra avenue, kmart p plaza, san remo,3670, sa,19800916,7812219
-rec-1042-dup-0, kiandra, ,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205
-rec-1042-dup-1, kiandra, cowle,2, gatliff place, rustenubr g sth, girgarre,3995, qld,19801125,3328205
-rec-1042-org, kiandra, cowle,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205
-rec-1043-org, giorgia, frahn,62, handasyde street, ramano estate locn 1, tallebudgera,4506, vic,19670206,9724789
-rec-1044-dup-0, nicole, shadbolt,46, schlich s treet, simpson army barracks, toowoomba,3000, wa,19030926,8190756
-rec-1044-dup-1, nicole, carbone,46, schlich nstreet, simpson army barracks, toowoomba,3000, wa,19030926,8190756
-rec-1044-dup-2, nicole, carbone,46, schlich street, simpson arm ybarracks, toowong,3000, wa,19030926,8190756
-rec-1044-dup-3, nicole, carbone,46, schlich street, simpsonary barracks, toowoomba,3000, wa,19030926,8190756
-rec-1044-org, nicole, carbone,46, schlich street, simpson army barracks, toowoomba,3000, wa,19030926,8190756
diff --git a/test/testFebrl/testArgs.py b/test/testFebrl/testArgs.py
index 7dc2b554a..8dd6221b6 100644
--- a/test/testFebrl/testArgs.py
+++ b/test/testFebrl/testArgs.py
@@ -484,104 +484,6 @@ def test_createArgumentsFromJSON(self):
self.assertIsInstance(obj, Arguments)
- def test_writeArgumentsToJSON(self):
- json_file_name = "arguments_file.json"
-
- args.writeArgumentsToJSON(json_file_name)
-
- self.assertTrue(os.path.exists(json_file_name))
- os.remove(json_file_name)
-
- def test_writeArgumentsToJSONString(self):
- # print("new args: ", args1)
- # print("old args: ", args)
- json_string = args1.writeArgumentsToJSONString()
- # json_string1 = args.writeArgumentsToJSONString()
- print("json_string: ",json_string)
- # print("oldjson_string: ", json_string1)
- data = json.loads(json_string)
- print("data: ", data)
-
- self.assertEqual(data['modelId'], "100")
- self.assertEqual(data['zinggDir'], "models")
-
- def test_createArgumentsFromJSONString(self):
- sample_json = '''
- {
- "fieldDefinition": [
- {
- "fieldName": "recId",
- "matchType": "dont_use",
- "fields": "recId",
- "dataType": "string"
- },
- {
- "fieldName": "fname",
- "matchType": "fuzzy",
- "fields": "fname",
- "dataType": "string"
- },
- {
- "fieldName": "lname",
- "matchType": "fuzzy",
- "fields": "lname",
- "dataType": "string"
- },
- {
- "fieldName": "stNo",
- "matchType": "fuzzy",
- "fields": "stNo",
- "dataType": "string"
- },
- {
- "fieldName": "add1",
- "matchType": "fuzzy",
- "fields": "add1",
- "dataType": "string"
- }
- ],
- "output": [
- {
- "name": "output",
- "format": "csv",
- "props": {
- "path": "/tmp/zinggOutput",
- "delimiter": ",",
- "header": true
- }
- }
- ],
- "data": [
- {
- "name": "test",
- "format": "csv",
- "props": {
- "path": "examples/febrl/test.csv",
- "delimiter": ",",
- "header": false
- },
- "schema": "recId string, fname string, lname string, stNo string, add1 string"
- }
- ],
- "labelDataSampleSize": 0.5,
- "numPartitions": 4,
- "modelId": 100,
- "zinggDir": "models"
- }
- '''
- phase = "label"
-
- obj = args.createArgumentsFromJSONString(sample_json, phase)
-
- self.assertIsInstance(obj, Arguments)
- self.assertEqual(obj.getModelId(), "100")
-
- def test_copyArgs(self):
- phase = "test_phase"
- copied_args = args.copyArgs(phase)
-
- self.assertIsInstance(copied_args, Arguments)
-
class TestClientOptions(TestCase):
def setUp(self):
@@ -615,9 +517,6 @@ def test_setOptionValue(self):
else:
self.fail(f"getOptionValue raised an unexpected exception: {str(e)}")
- def test_getPhase(self):
- phase_value = self.client_options.getPhase()
- self.assertEqual(phase_value, 'peekModel')
def test_setPhase(self):
self.client_options.setPhase('new_phase')