diff --git a/Dockerfile b/Dockerfile index 978bbfd52..bdfe8cfb5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,17 +4,17 @@ USER 0 RUN apt-get update && \ apt install -y curl vim ENV SPARK_MASTER local[*] -ENV ZINGG_HOME /zingg-0.5.0 +ENV ZINGG_HOME /zingg-0.6.0 ENV PATH $ZINGG_HOME/scripts:$PATH ENV LANG C.UTF-8 WORKDIR / USER root -WORKDIR /zingg-0.5.0 -RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.5.0/zingg-0.5.0-spark-3.5.0.tar.gz | \ +WORKDIR /zingg-0.6.0 +RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.6.0/zingg-0.6.0-spark-3.6.0.tar.gz | \ tar --extract --gzip --strip=1 RUN pip install -r python/requirements.txt RUN pip install zingg -RUN chmod -R a+rwx /zingg-0.5.0/models -RUN chown -R spark /zingg-0.5.0/models +RUN chmod -R a+rwx /zingg-0.6.0/models +RUN chown -R spark /zingg-0.6.0/models USER spark diff --git a/common/client/src/main/java/zingg/common/client/Client.java b/common/client/src/main/java/zingg/common/client/Client.java index 114221e0d..9faa8bd06 100644 --- a/common/client/src/main/java/zingg/common/client/Client.java +++ b/common/client/src/main/java/zingg/common/client/Client.java @@ -59,7 +59,7 @@ public Client(IZArgs args, ClientOptions options, String zFactory) throws ZinggC setOptions(options); try { buildAndSetArguments(args, options); - setZingg(args, options); + setZingg(options); } catch (Exception e) { throw new ZinggClientException("An error has occured while setting up the client", e); @@ -91,14 +91,14 @@ public IZinggFactory getZinggFactory() throws InstantiationException, IllegalAcc - public void setZingg(IZArgs args, ClientOptions options) throws Exception{ + public void setZingg(ClientOptions options) throws Exception{ IZinggFactory zf = getZinggFactory(); try{ setZingg(zf.get(ZinggOptions.getByValue(options.get(ClientOptions.PHASE).value.trim()))); } catch(Exception e) { - //set default - setZingg(zf.get(ZinggOptions.getByValue(ZinggOptions.PEEK_MODEL.getName()))); + LOG.error("Error creating zingg instance for phase " + options.get(ClientOptions.PHASE).value.trim(), e); + throw e; } } diff --git a/common/client/src/main/java/zingg/common/client/MatchTypes.java b/common/client/src/main/java/zingg/common/client/MatchTypes.java index 3edd727fe..cbd552369 100644 --- a/common/client/src/main/java/zingg/common/client/MatchTypes.java +++ b/common/client/src/main/java/zingg/common/client/MatchTypes.java @@ -42,13 +42,13 @@ public static String[] getAllMatchTypes() { return s; } - public static IMatchType getByName(String name) throws Exception{ + public static IMatchType getByName(String name) throws IllegalArgumentException{ for (IMatchType zo: MatchTypes.allMatchTypes.values()) { if (zo.getName().equalsIgnoreCase(name)) { return zo; } } - return null; + throw new IllegalArgumentException("Invalid match type: " + name); } } diff --git a/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java b/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java index 64ea1a5a5..7658ce8e1 100644 --- a/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java +++ b/common/client/src/main/java/zingg/common/client/arguments/ArgumentServiceImpl.java @@ -45,7 +45,7 @@ public A loadArguments(String path) throws ZinggClientException, NoSuchObjectExc @Override public void writeArguments(String path, IZArgs args) throws ZinggClientException, NoSuchObjectException { - ArgumentsWriter argumentsWriter = writerFactory.getArgumentsWriter(WriterType.JSON); + ArgumentsWriter argumentsWriter = writerFactory.getArgumentsWriter(WriterType.FILE); argumentsWriter.write(path, args); } diff --git a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java index cbca5647b..bc30825da 100644 --- a/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java +++ b/common/client/src/main/java/zingg/common/client/options/ZinggOptions.java @@ -1,11 +1,11 @@ package zingg.common.client.options; -import java.util.HashMap; -import java.util.Map; - import zingg.common.client.ZinggClientException; import zingg.common.client.util.Util; +import java.util.HashMap; +import java.util.Map; + public class ZinggOptions { public final static ZinggOption TRAIN = new ZinggOption("train"); @@ -18,9 +18,6 @@ public class ZinggOptions { public final static ZinggOption RECOMMEND = new ZinggOption("recommend"); public final static ZinggOption UPDATE_LABEL = new ZinggOption("updateLabel"); public final static ZinggOption FIND_AND_LABEL = new ZinggOption("findAndLabel"); - public final static ZinggOption ASSESS_MODEL = new ZinggOption("assessModel"); - public final static ZinggOption PEEK_MODEL = new ZinggOption("peekModel"); - public final static ZinggOption EXPORT_MODEL = new ZinggOption("exportModel"); public static Map allZinggOptions;// = new HashMap(); diff --git a/common/client/src/test/java/zingg/common/client/TestArguments.java b/common/client/src/test/java/zingg/common/client/TestArguments.java index f0961907b..0e26a2593 100644 --- a/common/client/src/test/java/zingg/common/client/TestArguments.java +++ b/common/client/src/test/java/zingg/common/client/TestArguments.java @@ -1,5 +1,14 @@ package zingg.common.client; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.jupiter.api.Test; +import zingg.common.client.arguments.ArgumentServiceImpl; +import zingg.common.client.arguments.IArgumentService; +import zingg.common.client.arguments.loader.template.EnvironmentVariableSubstitutor; +import zingg.common.client.arguments.model.Arguments; +import zingg.common.client.arguments.model.IArguments; + import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -9,16 +18,8 @@ import java.util.List; import java.util.Map; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.jupiter.api.Test; -import zingg.common.client.arguments.ArgumentServiceImpl; -import zingg.common.client.arguments.IArgumentService; -import zingg.common.client.arguments.model.Arguments; -import zingg.common.client.arguments.model.IArguments; -import zingg.common.client.arguments.loader.template.EnvironmentVariableSubstitutor; - -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; public class TestArguments { diff --git a/common/core/src/test/resources/testPeekModel/config.json b/common/core/src/test/resources/testPeekModel/config.json deleted file mode 100644 index 07c02b3ec..000000000 --- a/common/core/src/test/resources/testPeekModel/config.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "fieldDefinition":[ - { - "fieldName" : "id", - "matchType" : "dont_use", - "fields" : "fname", - "dataType": "string" - }, - { - "fieldName" : "fname", - "matchType" : "fuzzy", - "fields" : "fname", - "dataType": "string" - }, - { - "fieldName" : "lname", - "matchType" : "fuzzy", - "fields" : "lname", - "dataType": "string" - }, - { - "fieldName" : "stNo", - "matchType": "exact", - "fields" : "stNo", - "dataType": "string" - }, - { - "fieldName" : "add1", - "matchType": "fuzzy", - "fields" : "add1", - "dataType": "string" - }, - { - "fieldName" : "add2", - "matchType": "fuzzy", - "fields" : "add2", - "dataType": "string" - }, - { - "fieldName" : "city", - "matchType": "fuzzy", - "fields" : "city", - "dataType": "string" - }, - { - "fieldName" : "areacode", - "matchType": "exact", - "fields" : "areacode", - "dataType": "string" - }, - { - "fieldName" : "state", - "matchType": "exact", - "fields" : "state", - "dataType": "string" - }, - { - "fieldName" : "dob", - "matchType": "exact", - "fields" : "dob", - "dataType": "string" - }, - { - "fieldName" : "ssn", - "matchType": "exact", - "fields" : "ssn", - "dataType": "string" - } - ], - "output" : [{ - "name":"output", - "format":"csv", - "props": { - "path": "/tmp/testPeekModel/zinggOutput", - "delimiter": ",", - "header":true - } - }], - "data" : [{ - "name":"test", - "format":"csv", - "props": { - "path": "./testPeekModel/test.csv", - "delimiter": ",", - "header":false - }, - "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" - }], - "labelDataSampleSize" : 0.5, - "numPartitions":4, - "modelId": 100, - "zinggDir": "./testFebrl/models" - -} diff --git a/common/core/src/test/resources/testPeekModel/test.csv b/common/core/src/test/resources/testPeekModel/test.csv deleted file mode 100644 index 91175cb1c..000000000 --- a/common/core/src/test/resources/testPeekModel/test.csv +++ /dev/null @@ -1,65 +0,0 @@ -rec-1020-org, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765 -rec-1021-dup-0, thomas, georgze,1, mcmanus place, , north turarmurra,3130, sa,19630225,5460534 -rec-1021-org, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534 -rec-1022-dup-1, jackson, eglinron,840, mountview, fowles treet, burlei gh heads,2803, sa,19830807,2932837 -rec-1022-dup-2, jackson, eglinton,840, fowles street, moun tvjiew, burleigh heads,2830, ss, ,2932837 -rec-1022-dup-3, jackson, christo,840, fowles street, mou ntveiw, burleig heads,2830, sa,19830807,2932837 -rec-1022-dup-4, jackson, eglinton,840, fowles street, mountv iew, burleigh heads,2830, sa,19830807,2932837 -rec-1022-org, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837 -rec-1023-org, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080 -rec-1024-org, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362 -rec-1025-org, emiily, britten,8, kitchener street, hilltop hostel rowethorpe, lake heights,2463, qld,19491021,9588775 -rec-1026-dup-0, xani, green, , phill ip avenue, , armidale,5108, nsw,19390410,9201057 -rec-1026-dup-1, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201857 -rec-1026-org, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201057 -rec-1027-org, nathan, smallacombe,20, guthridge crescent, red cross units, sandy bay,6056, sa,19241223,7522263 -rec-1028-dup-0, , ,24, , woorinyan, riverwood,3749, qld,19180205,9341716 -rec-1028-dup-1, , eglinton,24, curriecrescent, woorinyan, riverwood,3749, qld,19180205,1909717 -rec-1028-org, , eglinton,24, currie crescent, woorinyan, riverwood,3749, qld,19180205,9341716 -rec-1029-dup-0, kylee, stepehndon,81, rose scott circuit, cordobak anor, ashfield,4226, vic,19461101,4783085 -rec-1029-dup-1, sachin, stephenson,81, rose scott circuit, cordoba manor, ashfi eld,4226, vic,19461101,4783085 -rec-1029-dup-2, annalise, stephenson,81, rose scott circuit, cordoba manor, ashfoeld,4226, vic,19461101,4783085 -rec-1029-dup-3, kykee, turale,81, rose scott circuit, , ashfield,4226, vic,19461101,4783085 -rec-1029-dup-4, kylee, stephenson,81, cordoba manor, rose scott circuit, ashfield,4226, vic,19461101,4783085 -rec-1029-org, kylee, stephenson,81, rose scott circuit, cordoba manor, ashfield,4226, vic,19461101,4783085 -rec-103-dup-0, benjamin, koerbin,15, wybel anah, violet grover place, mill park,2446, nsw,19210210,3808808 -rec-103-org, briony, koerbin,146, violet grover place, wybelanah, mill park,2446, nsw,19210210,3808808 -rec-1030-org, emma, crossman,53, mcdowall place, kellhaven, tara,5608, vic,19391027,3561186 -rec-1031-org, samantha, sabieray,68, quandong street, wattle brae, gorokan,4019, wa,19590807,2863290 -rec-1032-dup-0, brooklyn, naar-cafentas,210, duffy street, tourist psrk, berwick,2481, nsw, ,3624304 -rec-1032-org, brooklyn, naar-cafentas,210, duffy street, tourist park, berwick,2481, nsw,19840802,3624304 -rec-1033-dup-0, keziah, painter,18, ainsli e avenue, sec 1, torquay,3205, vic,19191031,7801066 -rec-1033-org, keziah, painter,18, ainslie avenue, sec 1, torquay,3205, vic,19191031,7801066 -rec-1034-dup-0, erin, maynard,24, , wariala, little river,2777, vic,19970430,7429462 -rec-1034-dup-1, erin, maynard,51, wilshire street, warialda, little irver,2777, vic,19970430,1815999 -rec-1034-dup-2, hayley, maynard,14, wilshire street, , little river,2777, vic,19970430,7429462 -rec-1034-org, erin, maynard,14, wilshire street, warialda, little river,2777, vic,19970430,7429462 -rec-1035-dup-0, jaiden, rollins,48, tulgeywood, rossarden street, balwyn north,2224, nt,19280722,7626396 -rec-1035-dup-1, jaiden, rollins,95, rossarden street, tulgewyood, balwyn north,2224, nt,19280722,7626396 -rec-1035-dup-2, jaiden, rolilns,48, swinden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 -rec-1035-dup-3, jaiden, rolli ns,48, tulgeywomod, rossarden street, balwyn north,2224, nf,19280722,7626396 -rec-1035-org, jaiden, rollins,48, rossarden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 -rec-1036-dup-0, , held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 -rec-1036-dup-1, sarsha, held,42, lampard circuit, , golden bay,2447, vic,19510806,3710651 -rec-1036-org, amber, held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 -rec-1037-org, connor, beckwith,10, heard street, , mill park,5031, nsw,19081103,2209091 -rec-1038-org, danny, campbell,95, totterdell street, moama, shellharbour,2209, vic,19951105,9554924 -rec-1039-dup-0, angus, roas,62, gormansto crescent, mlc centre, kiruwah,3350, sa,19250817,2655081 -rec-1039-org, angus, rosa,62, gormanston crescent, mlc centre, kirwan,3350, sa,19250817,2655081 -rec-104-dup-0, benjaminl, carbone,18, arthella, wattle s treet, orange,3550, vic,19050820,3677127 -rec-104-org, benjamin, carbone,18, wattle street, arthella, orange,3550, vic,19050820,3677127 -rec-1040-dup-0, matilda, mestrov, , housecicuit, retirement village, taringa,3820, qld,19801119,2536135 -rec-1040-dup-1, matilda, mestrv,5, house circuit, retirement village, taringa,3802, qld,19801119,2563135 -rec-1040-dup-2, matilda, mestrov,5, house circuit, retiremen tvillage, taringa,3820, ,19801119,2563135 -rec-1040-org, matilda, mestrov,5, house circuit, retirement village, taringa,3820, qld,19801119,2563135 -rec-1041-dup-0, tyler, frojd, , burramurra avenue, kmart p plaza, san rmeo,3670, sa,19800916,7812219 -rec-1041-org, tyler, froud,8, burramurra avenue, kmart p plaza, san remo,3670, sa,19800916,7812219 -rec-1042-dup-0, kiandra, ,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 -rec-1042-dup-1, kiandra, cowle,2, gatliff place, rustenubr g sth, girgarre,3995, qld,19801125,3328205 -rec-1042-org, kiandra, cowle,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 -rec-1043-org, giorgia, frahn,62, handasyde street, ramano estate locn 1, tallebudgera,4506, vic,19670206,9724789 -rec-1044-dup-0, nicole, shadbolt,46, schlich s treet, simpson army barracks, toowoomba,3000, wa,19030926,8190756 -rec-1044-dup-1, nicole, carbone,46, schlich nstreet, simpson army barracks, toowoomba,3000, wa,19030926,8190756 -rec-1044-dup-2, nicole, carbone,46, schlich street, simpson arm ybarracks, toowong,3000, wa,19030926,8190756 -rec-1044-dup-3, nicole, carbone,46, schlich street, simpsonary barracks, toowoomba,3000, wa,19030926,8190756 -rec-1044-org, nicole, carbone,46, schlich street, simpson army barracks, toowoomba,3000, wa,19030926,8190756 diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b83a8b29b..8584c3ce3 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -43,7 +43,6 @@ * [Using Pre-existing Training Data](setup/training/addOwnTrainingData.md) * [Updating Labeled Pairs](updatingLabels.md) * [Documenting The Training Data](stepbystep/createtrainingdata/generatingdocumentation.md) - * [Exporting Labeled Data](setup/training/exportLabeledData.md) * [Model Difference](stepbystep/createtrainingdata/modeldiff.md) * [Ensuring Scalability](verifyBlocking.md) * [Building And Saving The Model](setup/train.md) diff --git a/docs/connectors/jdbc/clickhouse.md b/docs/connectors/jdbc/clickhouse.md new file mode 100644 index 000000000..076d84f2a --- /dev/null +++ b/docs/connectors/jdbc/clickhouse.md @@ -0,0 +1,53 @@ +# ClickHouse + +ClickHouse Pipe Definitions +JSON settings for reading and writing data using the ClickHouse JDBC driver. + +## ClickHouse Input (Reading) + +```json +"data": [ + { + "name": "clickhouse_input", + "format": "jdbc", + "props": { + "url": "jdbc:clickhouse:https://:/?ssl=true", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "user": "", + "password": "", + "dbtable": "" + } + } +] +``` + +## ClickHouse Output (Writing) + +```json +"output": [ + { + "name": "clickhouse_output", + "format": "jdbc", + "props": { + "url": "jdbc:clickhouse:https://:/?ssl=true", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "user": "", + "password": "", + "dbtable": "", + "saveMode": "append" + } + } +] +``` + +## Implementation Steps + +### Add the Driver Jar +Download the `clickhouse-jdbc-0.9.8-all.jar` and add its path to `config/zingg.conf` to ensure Spark can load the driver: + +```properties +spark.jars=/path/to/clickhouse-jdbc-0.9.8-all.jar +``` + +### Port +Use port `8443` for ClickHouse Cloud (HTTPS) or `8123` for local HTTP instances. diff --git a/docs/setup/training/addOwnTrainingData.md b/docs/setup/training/addOwnTrainingData.md index b25c1f815..21706faf4 100644 --- a/docs/setup/training/addOwnTrainingData.md +++ b/docs/setup/training/addOwnTrainingData.md @@ -17,6 +17,4 @@ Here, the first column specifies the z_cluster, the second column specifies the The above training data can be specified using [trainingSamples attribute in the configuration.](../../../examples/febrl/configWithTrainingSamples.json) -In addition, labeled data of one model can also be exported and used as training data for another model. For details, check out [exporting labeled data](exportLabeledData.md). - **Note**: It is advisable to still run [findTrainingData](findTrainingData.md) and [label](label.md) a few rounds to tune Zingg with the supplied training data as well as patterns it needs to learn independently. diff --git a/docs/setup/training/exportLabeledData.md b/docs/setup/training/exportLabeledData.md deleted file mode 100644 index aaf32faed..000000000 --- a/docs/setup/training/exportLabeledData.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -parent: Creating training data -title: Exporting labeled data as csv -grand_parent: Step By Step Guide -nav_order: 4 ---- - -# Exporting Labeled Data - -If we need to send our labeled data for a subject matter expert to review or if we want to build another model in a new location and [reuse training effort](addOwnTrainingData.md) from earlier, we can write our labeled data to a csv - -`./scripts/zingg.sh --phase exportModel --conf --location ` diff --git a/python/phases/exportModel.py b/python/phases/exportModel.py deleted file mode 100644 index 336b1f587..000000000 --- a/python/phases/exportModel.py +++ /dev/null @@ -1,52 +0,0 @@ -from zingg.client import * -import sys -import argparse -import os - -logging.basicConfig(level=logging.INFO) -LOG = logging.getLogger("zingg.exportModel") - -def main(): - - # ckecking for mandatory option --location for this phase - if(ClientOptions(sys.argv[1:]).hasLocation()==False): - LOG.error("--location argument is mandatory for this phase, please specify") - LOG.info("--location is location of CSV file for exported data") - sys.exit() - - LOG.info("Phase ExportModel starts") - - options = ClientOptions(sys.argv[1:]) - options.setPhase("peekModel") - arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase()) - client = Zingg(arguments, options) - client.init() - - pMarkedDF = getPandasDfFromDs(client.getMarkedRecords()) - labelledData = getSparkSession().createDataFrame(pMarkedDF) - location = options.getLocation() - - export_data(labelledData, location) - - LOG.info("Phase ExportModel ends") - -def export_data(labelledData, location): - - baseCols = ['z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_zsource', 'z_isMatch'] - sourceDataColumns = [c for c in labelledData.columns if c not in baseCols] - additionalTrainingColumns = ['z_cluster','z_isMatch'] - trainingSampleColumns = [*additionalTrainingColumns, *sourceDataColumns] - trainingSamples = labelledData.select(trainingSampleColumns) - - # Getting schema - trainingSamples.schema.jsonValue() - trainingSamples.show() - trainingSamples.columns - print(trainingSampleColumns) - - # Exporting the labelled data as CSV - trainingSamples.toPandas().to_csv(os.path.join(location,r'exportedData.csv'), index=False) - - -if __name__ == "__main__": - main() diff --git a/python/zingg/client.py b/python/zingg/client.py index 772127617..ce33b45fd 100644 --- a/python/zingg/client.py +++ b/python/zingg/client.py @@ -670,14 +670,6 @@ def setLabelDataSampleSize(self, labelDataSampleSize): """ self.args.setLabelDataSampleSize(labelDataSampleSize) - def writeArgumentsToJSON(self, fileName): - """Method to write JSON file from the object of this class - - :param fileName: The CONF parameter value of ClientOption object or file address of json file - :type fileName: String - """ - getJVM().zingg.common.client.arguments.ArgumentServiceImpl().writeArguments(fileName, self.args) - def setStopWordsCutoff(self, stopWordsCutoff): """Method to set stopWordsCutoff parameter value By default, Zingg extracts 10% of the high frequency unique words from a dataset. If user wants different selection, they should set up StopWordsCutoff property @@ -711,29 +703,6 @@ def createArgumentsFromJSON(fileName, phase): obj.args = getJVM().zingg.common.client.argumentst.ArgumentServiceImpl().loadArguments(fileName) return obj - def writeArgumentsToJSONString(self): - """Method to create an object of this class from the JSON file and phase parameter value. - - :param fileName: The CONF parameter value of ClientOption object - :type fileName: String - :param phase: The PHASE parameter value of ClientOption object - :type phase: String - :return: The pointer containing address of the this class object - :rtype: pointer(Arguments) - """ - jsonString = getJVM().java.lang.String() - return getJVM().zingg.common.client.arguments.ArgumentServiceImpl().writeArguments(jsonString, self.args) - - @staticmethod - def createArgumentsFromJSONString(jsonArgs, phase): - obj = Arguments() - obj.args = getJVM().zingg.common.client.arguments.ArgumentServiceImpl().loadArguments(jsonArgs) - return obj - - def copyArgs(self, phase): - argsString = self.writeArgumentsToJSONString() - return self.createArgumentsFromJSONString(argsString, phase) - class ClientOptions: """Class that contains Client options for Zingg object @@ -770,7 +739,6 @@ def __init__(self, argsSent=None): args = argsSent.copy() if self.PHASE not in args: args.append(self.PHASE) - args.append("peekModel") if self.LICENSE not in args: args.append(self.LICENSE) args.append("zinggLic.txt") @@ -922,7 +890,7 @@ def parseArguments(argv): """ parser = argparse.ArgumentParser(description="Zingg's python APIs") mandatoryOptions = parser.add_argument_group("mandatory arguments") - mandatoryOptions.add_argument("--phase", required=True, help="python phase e.g. assessModel") + mandatoryOptions.add_argument("--phase", required=True) mandatoryOptions.add_argument( "--conf", required=True, diff --git a/spark/client/src/test/java/zingg/spark/client/TestArguments.java b/spark/client/src/test/java/zingg/spark/client/TestArguments.java index a3b130fac..19aeb848a 100644 --- a/spark/client/src/test/java/zingg/spark/client/TestArguments.java +++ b/spark/client/src/test/java/zingg/spark/client/TestArguments.java @@ -1,28 +1,27 @@ package zingg.spark.client; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.rmi.NoSuchObjectException; -import java.util.Arrays; -import java.util.List; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.junit.jupiter.api.Test; - +import zingg.common.client.FieldDefinition; +import zingg.common.client.IMatchType; +import zingg.common.client.MatchTypes; +import zingg.common.client.ZinggClientException; import zingg.common.client.arguments.ArgumentServiceImpl; import zingg.common.client.arguments.IArgumentService; import zingg.common.client.arguments.loader.LoaderFactory; import zingg.common.client.arguments.model.Arguments; -import zingg.common.client.FieldDefinition; import zingg.common.client.arguments.model.IArguments; -import zingg.common.client.IMatchType; -import zingg.common.client.MatchTypes; -import zingg.common.client.ZinggClientException; import zingg.common.client.arguments.writer.WriterFactory; import zingg.common.client.pipe.Pipe; import zingg.spark.client.pipe.SparkPipe; +import java.rmi.NoSuchObjectException; +import java.util.Arrays; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + public class TestArguments { public static final Log LOG = LogFactory.getLog(TestArguments.class); @@ -62,7 +61,7 @@ public void testWriteArgumentObjectToJSONFile() throws ZinggClientException, NoS args.setBlockSize(400L); args.setCollectMetrics(true); args.setModelId("500"); - argumentService.loadArguments("/tmp/configFromArgObject.json"); + argumentService.writeArguments("/tmp/configFromArgObject.json", args); //reload the same config file to check if deserialization is successful IArguments newArgs = argumentService.loadArguments("/tmp/configFromArgObject.json"); diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkPythonPhaseRunner.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkPythonPhaseRunner.java deleted file mode 100644 index d31de7957..000000000 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkPythonPhaseRunner.java +++ /dev/null @@ -1,72 +0,0 @@ -package zingg.spark.core.executor; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.deploy.PythonRunner; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.SparkSession; - -import zingg.common.client.ClientOptions; -import zingg.common.client.arguments.model.IZArgs; -import zingg.common.client.ZinggClientException; -import zingg.common.client.options.ZinggOptions; - -import zingg.common.core.executor.ZinggBase; -import zingg.spark.core.context.ZinggSparkContext; - - -public class SparkPythonPhaseRunner extends ZinggBase, Row, Column, DataType>{ - - private static final long serialVersionUID = 1L; - protected static String name = "zingg.spark.core.executor.SparkPythonPhaseRunner"; - public static final Log LOG = LogFactory.getLog(SparkPythonPhaseRunner.class); - - public SparkPythonPhaseRunner() { - setZinggOption(ZinggOptions.PEEK_MODEL); - setContext(new ZinggSparkContext()); - - } - - @Override - public void init(IZArgs args, SparkSession s, ClientOptions options) - throws ZinggClientException { - super.init(args,s,options); - getContext().setUtils(); - //we wil not init here as we wnt py to drive - //the spark session etc - getContext().init(s); - } - - @Override - public void execute() throws ZinggClientException { - try { - //closing session here - //as pyspark will further create it - //TODO getOrCreate not working in pyspark - SparkSession sparkSession = context.getSession(); - sparkSession.stop(); - LOG.info("Generic Python phase starts"); - //LOG.info(this.getClass().getClassLoader().getResource("python/phases/assessModel.py").getFile()); - List pyArgs = new ArrayList(); - String phase = clientOptions.get(ClientOptions.PHASE).getValue(); - pyArgs.add("python/phases/" + phase + ".py"); - pyArgs.add(""); - for (String c: clientOptions.getCommandLineArgs()) { - pyArgs.add(c); - } - PythonRunner.main(pyArgs.toArray(new String[pyArgs.size()])); - - LOG.info("Generic Python phase ends"); - } catch (Exception exception) { - throw new ZinggClientException("Error occurred while executing python phase, ", exception); - } - } - - -} diff --git a/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java b/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java index 26827cc9c..d190264d0 100644 --- a/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java +++ b/spark/core/src/main/java/zingg/spark/core/executor/SparkZFactory.java @@ -24,7 +24,6 @@ public SparkZFactory() {} zinggers.put(ZinggOptions.UPDATE_LABEL, SparkLabelUpdater.name); zinggers.put(ZinggOptions.FIND_AND_LABEL, SparkFindAndLabeller.name); zinggers.put(ZinggOptions.RECOMMEND, SparkRecommender.name); - zinggers.put(ZinggOptions.PEEK_MODEL, SparkPythonPhaseRunner.name); } public IZingg get(ZinggOption z) throws InstantiationException, IllegalAccessException, ClassNotFoundException { diff --git a/spark/core/src/test/java/zingg/spark/core/executor/TestPeekModel.java b/spark/core/src/test/java/zingg/spark/core/executor/TestPeekModel.java deleted file mode 100644 index 1c1e9e652..000000000 --- a/spark/core/src/test/java/zingg/spark/core/executor/TestPeekModel.java +++ /dev/null @@ -1,46 +0,0 @@ -package zingg.spark.core.executor; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -/**end to end integration test*/ -public class TestPeekModel { - public static final Log LOG = LogFactory.getLog(TestPeekModel.class); - - /* - InMemoryPipe outputPipe; - - @BeforeEach - public void setUp() throws Exception, ZinggClientException{ - args = Arguments.createArgumentsFromJSON(getClass().getResource("/testPeekModel/config.json").getFile()); - args.setZinggDir(getClass().getResource("/testFebrl/models").getPath()); - Pipe dataPipe = args.getData()[0]; - dataPipe.setProp(FilePipe.LOCATION, getClass().getResource("/testPeekModel/test.csv").getPath()); - args.setData(new Pipe[]{dataPipe}); - outputPipe = new InMemoryPipe(dataPipe); - args.setOutput(new Pipe[]{outputPipe}); - } - - - @Test - public void testOutput(){ - PeekModel pm = new PeekModel(); - try { - pm.init(args, "abc"); - pm.setSpark(spark); - pm.setArgs(args); - pm.setClientOptions(new ClientOptions("--phase", "assessModel", "--conf", "testPeekModel/config.json", "--license", "licText.txt")); - pm.execute(); - - Dataset dfm = pm.getMarkedRecords(); - assertEquals(80,dfm.count()); - - - } catch (ZinggClientException e) { - // TODO Auto-generated catch block - fail("did not expect " + e); - - } - - } - */ -} diff --git a/spark/core/src/test/resources/testPeekModel/config.json b/spark/core/src/test/resources/testPeekModel/config.json deleted file mode 100644 index 4b1547936..000000000 --- a/spark/core/src/test/resources/testPeekModel/config.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "fieldDefinition":[ - { - "fieldName" : "id", - "matchType" : "dont_use", - "fields" : "fname", - "dataType": "string" - }, - { - "fieldName" : "fname", - "matchType" : "fuzzy", - "fields" : "fname", - "dataType": "string" - }, - { - "fieldName" : "lname", - "matchType" : "fuzzy", - "fields" : "lname", - "dataType": "string" - }, - { - "fieldName" : "stNo", - "matchType": "exact", - "fields" : "stNo", - "dataType": "string" - }, - { - "fieldName" : "add1", - "matchType": "fuzzy", - "fields" : "add1", - "dataType": "string" - }, - { - "fieldName" : "add2", - "matchType": "fuzzy", - "fields" : "add2", - "dataType": "string" - }, - { - "fieldName" : "city", - "matchType": "fuzzy", - "fields" : "city", - "dataType": "string" - }, - { - "fieldName" : "areacode", - "matchType": "exact", - "fields" : "areacode", - "dataType": "string" - }, - { - "fieldName" : "state", - "matchType": "exact", - "fields" : "state", - "dataType": "string" - }, - { - "fieldName" : "dob", - "matchType": "exact", - "fields" : "dob", - "dataType": "string" - }, - { - "fieldName" : "ssn", - "matchType": "exact", - "fields" : "ssn", - "dataType": "string" - } - ], - "output" : [{ - "name":"output", - "format":"csv", - "props": { - "path": "/tmp/testPeekModel/zinggOutput", - "delimiter": ",", - "header":true - } - }], - "data" : [{ - "name":"test", - "format":"csv", - "props": { - "path": "./testPeekModel/test.csv", - "delimiter": ",", - "header":false - }, - "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" - }], - "labelDataSampleSize" : 0.5, - "numPartitions":4, - "modelId": 100, - "zinggDir": "./testFebrl/models" - -} diff --git a/spark/core/src/test/resources/testPeekModel/test.csv b/spark/core/src/test/resources/testPeekModel/test.csv deleted file mode 100644 index 91175cb1c..000000000 --- a/spark/core/src/test/resources/testPeekModel/test.csv +++ /dev/null @@ -1,65 +0,0 @@ -rec-1020-org, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765 -rec-1021-dup-0, thomas, georgze,1, mcmanus place, , north turarmurra,3130, sa,19630225,5460534 -rec-1021-org, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534 -rec-1022-dup-1, jackson, eglinron,840, mountview, fowles treet, burlei gh heads,2803, sa,19830807,2932837 -rec-1022-dup-2, jackson, eglinton,840, fowles street, moun tvjiew, burleigh heads,2830, ss, ,2932837 -rec-1022-dup-3, jackson, christo,840, fowles street, mou ntveiw, burleig heads,2830, sa,19830807,2932837 -rec-1022-dup-4, jackson, eglinton,840, fowles street, mountv iew, burleigh heads,2830, sa,19830807,2932837 -rec-1022-org, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837 -rec-1023-org, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080 -rec-1024-org, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362 -rec-1025-org, emiily, britten,8, kitchener street, hilltop hostel rowethorpe, lake heights,2463, qld,19491021,9588775 -rec-1026-dup-0, xani, green, , phill ip avenue, , armidale,5108, nsw,19390410,9201057 -rec-1026-dup-1, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201857 -rec-1026-org, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201057 -rec-1027-org, nathan, smallacombe,20, guthridge crescent, red cross units, sandy bay,6056, sa,19241223,7522263 -rec-1028-dup-0, , ,24, , woorinyan, riverwood,3749, qld,19180205,9341716 -rec-1028-dup-1, , eglinton,24, curriecrescent, woorinyan, riverwood,3749, qld,19180205,1909717 -rec-1028-org, , eglinton,24, currie crescent, woorinyan, riverwood,3749, qld,19180205,9341716 -rec-1029-dup-0, kylee, stepehndon,81, rose scott circuit, cordobak anor, ashfield,4226, vic,19461101,4783085 -rec-1029-dup-1, sachin, stephenson,81, rose scott circuit, cordoba manor, ashfi eld,4226, vic,19461101,4783085 -rec-1029-dup-2, annalise, stephenson,81, rose scott circuit, cordoba manor, ashfoeld,4226, vic,19461101,4783085 -rec-1029-dup-3, kykee, turale,81, rose scott circuit, , ashfield,4226, vic,19461101,4783085 -rec-1029-dup-4, kylee, stephenson,81, cordoba manor, rose scott circuit, ashfield,4226, vic,19461101,4783085 -rec-1029-org, kylee, stephenson,81, rose scott circuit, cordoba manor, ashfield,4226, vic,19461101,4783085 -rec-103-dup-0, benjamin, koerbin,15, wybel anah, violet grover place, mill park,2446, nsw,19210210,3808808 -rec-103-org, briony, koerbin,146, violet grover place, wybelanah, mill park,2446, nsw,19210210,3808808 -rec-1030-org, emma, crossman,53, mcdowall place, kellhaven, tara,5608, vic,19391027,3561186 -rec-1031-org, samantha, sabieray,68, quandong street, wattle brae, gorokan,4019, wa,19590807,2863290 -rec-1032-dup-0, brooklyn, naar-cafentas,210, duffy street, tourist psrk, berwick,2481, nsw, ,3624304 -rec-1032-org, brooklyn, naar-cafentas,210, duffy street, tourist park, berwick,2481, nsw,19840802,3624304 -rec-1033-dup-0, keziah, painter,18, ainsli e avenue, sec 1, torquay,3205, vic,19191031,7801066 -rec-1033-org, keziah, painter,18, ainslie avenue, sec 1, torquay,3205, vic,19191031,7801066 -rec-1034-dup-0, erin, maynard,24, , wariala, little river,2777, vic,19970430,7429462 -rec-1034-dup-1, erin, maynard,51, wilshire street, warialda, little irver,2777, vic,19970430,1815999 -rec-1034-dup-2, hayley, maynard,14, wilshire street, , little river,2777, vic,19970430,7429462 -rec-1034-org, erin, maynard,14, wilshire street, warialda, little river,2777, vic,19970430,7429462 -rec-1035-dup-0, jaiden, rollins,48, tulgeywood, rossarden street, balwyn north,2224, nt,19280722,7626396 -rec-1035-dup-1, jaiden, rollins,95, rossarden street, tulgewyood, balwyn north,2224, nt,19280722,7626396 -rec-1035-dup-2, jaiden, rolilns,48, swinden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 -rec-1035-dup-3, jaiden, rolli ns,48, tulgeywomod, rossarden street, balwyn north,2224, nf,19280722,7626396 -rec-1035-org, jaiden, rollins,48, rossarden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 -rec-1036-dup-0, , held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 -rec-1036-dup-1, sarsha, held,42, lampard circuit, , golden bay,2447, vic,19510806,3710651 -rec-1036-org, amber, held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 -rec-1037-org, connor, beckwith,10, heard street, , mill park,5031, nsw,19081103,2209091 -rec-1038-org, danny, campbell,95, totterdell street, moama, shellharbour,2209, vic,19951105,9554924 -rec-1039-dup-0, angus, roas,62, gormansto crescent, mlc centre, kiruwah,3350, sa,19250817,2655081 -rec-1039-org, angus, rosa,62, gormanston crescent, mlc centre, kirwan,3350, sa,19250817,2655081 -rec-104-dup-0, benjaminl, carbone,18, arthella, wattle s treet, orange,3550, vic,19050820,3677127 -rec-104-org, benjamin, carbone,18, wattle street, arthella, orange,3550, vic,19050820,3677127 -rec-1040-dup-0, matilda, mestrov, , housecicuit, retirement village, taringa,3820, qld,19801119,2536135 -rec-1040-dup-1, matilda, mestrv,5, house circuit, retirement village, taringa,3802, qld,19801119,2563135 -rec-1040-dup-2, matilda, mestrov,5, house circuit, retiremen tvillage, taringa,3820, ,19801119,2563135 -rec-1040-org, matilda, mestrov,5, house circuit, retirement village, taringa,3820, qld,19801119,2563135 -rec-1041-dup-0, tyler, frojd, , burramurra avenue, kmart p plaza, san rmeo,3670, sa,19800916,7812219 -rec-1041-org, tyler, froud,8, burramurra avenue, kmart p plaza, san remo,3670, sa,19800916,7812219 -rec-1042-dup-0, kiandra, ,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 -rec-1042-dup-1, kiandra, cowle,2, gatliff place, rustenubr g sth, girgarre,3995, qld,19801125,3328205 -rec-1042-org, kiandra, cowle,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 -rec-1043-org, giorgia, frahn,62, handasyde street, ramano estate locn 1, tallebudgera,4506, vic,19670206,9724789 -rec-1044-dup-0, nicole, shadbolt,46, schlich s treet, simpson army barracks, toowoomba,3000, wa,19030926,8190756 -rec-1044-dup-1, nicole, carbone,46, schlich nstreet, simpson army barracks, toowoomba,3000, wa,19030926,8190756 -rec-1044-dup-2, nicole, carbone,46, schlich street, simpson arm ybarracks, toowong,3000, wa,19030926,8190756 -rec-1044-dup-3, nicole, carbone,46, schlich street, simpsonary barracks, toowoomba,3000, wa,19030926,8190756 -rec-1044-org, nicole, carbone,46, schlich street, simpson army barracks, toowoomba,3000, wa,19030926,8190756 diff --git a/test/testFebrl/testArgs.py b/test/testFebrl/testArgs.py index 7dc2b554a..8dd6221b6 100644 --- a/test/testFebrl/testArgs.py +++ b/test/testFebrl/testArgs.py @@ -484,104 +484,6 @@ def test_createArgumentsFromJSON(self): self.assertIsInstance(obj, Arguments) - def test_writeArgumentsToJSON(self): - json_file_name = "arguments_file.json" - - args.writeArgumentsToJSON(json_file_name) - - self.assertTrue(os.path.exists(json_file_name)) - os.remove(json_file_name) - - def test_writeArgumentsToJSONString(self): - # print("new args: ", args1) - # print("old args: ", args) - json_string = args1.writeArgumentsToJSONString() - # json_string1 = args.writeArgumentsToJSONString() - print("json_string: ",json_string) - # print("oldjson_string: ", json_string1) - data = json.loads(json_string) - print("data: ", data) - - self.assertEqual(data['modelId'], "100") - self.assertEqual(data['zinggDir'], "models") - - def test_createArgumentsFromJSONString(self): - sample_json = ''' - { - "fieldDefinition": [ - { - "fieldName": "recId", - "matchType": "dont_use", - "fields": "recId", - "dataType": "string" - }, - { - "fieldName": "fname", - "matchType": "fuzzy", - "fields": "fname", - "dataType": "string" - }, - { - "fieldName": "lname", - "matchType": "fuzzy", - "fields": "lname", - "dataType": "string" - }, - { - "fieldName": "stNo", - "matchType": "fuzzy", - "fields": "stNo", - "dataType": "string" - }, - { - "fieldName": "add1", - "matchType": "fuzzy", - "fields": "add1", - "dataType": "string" - } - ], - "output": [ - { - "name": "output", - "format": "csv", - "props": { - "path": "/tmp/zinggOutput", - "delimiter": ",", - "header": true - } - } - ], - "data": [ - { - "name": "test", - "format": "csv", - "props": { - "path": "examples/febrl/test.csv", - "delimiter": ",", - "header": false - }, - "schema": "recId string, fname string, lname string, stNo string, add1 string" - } - ], - "labelDataSampleSize": 0.5, - "numPartitions": 4, - "modelId": 100, - "zinggDir": "models" - } - ''' - phase = "label" - - obj = args.createArgumentsFromJSONString(sample_json, phase) - - self.assertIsInstance(obj, Arguments) - self.assertEqual(obj.getModelId(), "100") - - def test_copyArgs(self): - phase = "test_phase" - copied_args = args.copyArgs(phase) - - self.assertIsInstance(copied_args, Arguments) - class TestClientOptions(TestCase): def setUp(self): @@ -615,9 +517,6 @@ def test_setOptionValue(self): else: self.fail(f"getOptionValue raised an unexpected exception: {str(e)}") - def test_getPhase(self): - phase_value = self.client_options.getPhase() - self.assertEqual(phase_value, 'peekModel') def test_setPhase(self): self.client_options.setPhase('new_phase')