Skip to content

Commit 7da4399

Browse files
Merge pull request #132 from patrickfleith/bug/125-invalid-json-qa-entries
Bug/125 invalid json qa entries
2 parents 2c324bb + f4fcb3b commit 7da4399

28 files changed

Lines changed: 627 additions & 107 deletions

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ pip install datafast
4646

4747
### 1. Environment Setup
4848

49-
Make sure you have created a `secrets.env` file with your API keys.
49+
Make sure you have created a `.env` file with your API keys.
5050
HF token is needed if you want to push the dataset to your HF hub.
5151
Other keys depends on which LLM providers you use.
5252
```
@@ -64,7 +64,7 @@ from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider
6464
from dotenv import load_dotenv
6565

6666
# Load environment variables
67-
load_dotenv("secrets.env") # <--- your API keys
67+
load_dotenv() # <--- your API keys
6868
```
6969

7070
### 3. Configure Dataset

datafast/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Datafast - A Python package for synthetic text dataset generation"""
22

33
import importlib.metadata
4+
from datafast.logger_config import configure_logger
45

56
try:
67
__version__ = importlib.metadata.version("datafast")
@@ -11,3 +12,5 @@
1112
def get_version():
1213
"""Return the current version of the datafast package."""
1314
return __version__
15+
16+
__all__ = ["configure_logger", "get_version"]

datafast/datasets.py

Lines changed: 252 additions & 30 deletions
Large diffs are not rendered by default.

datafast/examples/classification_trail_conditions_example.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
from datafast.datasets import ClassificationDataset
22
from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig
33
from datafast.llms import OpenAIProvider, AnthropicProvider
4+
from datafast.logger_config import configure_logger
45
from dotenv import load_dotenv
56

67
# Load API keys
7-
load_dotenv("secrets.env")
8+
load_dotenv()
9+
10+
# Configure logger
11+
configure_logger()
812

913
# Configure dataset
1014
config = ClassificationDatasetConfig(

datafast/examples/generic_pipeline_example.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from datafast.schema.config import GenericPipelineDatasetConfig
88
from datafast.datasets import GenericPipelineDataset
99
from datafast.llms import OpenAIProvider, GeminiProvider, OllamaProvider
10+
from datafast.logger_config import configure_logger
11+
from dotenv import load_dotenv
1012

1113

1214
PROMPT_TEMPLATE = """I will give you a persona.
@@ -80,7 +82,6 @@ def main():
8082

8183

8284
if __name__ == "__main__":
83-
from dotenv import load_dotenv
84-
85-
load_dotenv("secrets.env")
85+
load_dotenv()
86+
configure_logger()
8687
main()

datafast/examples/generic_pipeline_response_format_example.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
from datafast.schema.config import GenericPipelineDatasetConfig
44
from datafast.utils import create_response_model
5+
from datafast.logger_config import configure_logger
6+
7+
# Configure logger
8+
configure_logger()
59

610
# Test with multiple columns and num_samples_per_prompt = 3
711
config = GenericPipelineDatasetConfig(

datafast/examples/generic_pipeline_row_model_example.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
from datafast.schema.config import GenericPipelineDatasetConfig
44
from datafast.utils import create_generic_pipeline_row_model
5+
from datafast.logger_config import configure_logger
6+
7+
# Configure logger
8+
configure_logger()
59

610
# Test with multiple input, forward, and output columns
711
config = GenericPipelineDatasetConfig(

datafast/examples/inspect_dataset_example.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,19 @@
55
python -m datafast.examples.inspect_dataset_example
66
77
Requires:
8-
- OpenAI API key in secrets.env or environment
8+
- OpenAI API key in .env or environment
99
- gradio package (pip install gradio)
1010
"""
1111
from datafast.datasets import ClassificationDataset
1212
from datafast.schema.config import ClassificationDatasetConfig, PromptExpansionConfig
13+
from datafast.logger_config import configure_logger
1314
from dotenv import load_dotenv
1415

15-
# Load API keys from environment or secrets.env
16-
load_dotenv("secrets.env")
16+
# Load API keys from environment or .env
17+
load_dotenv()
18+
19+
# Configure logger
20+
configure_logger()
1721

1822
# Configure the dataset generation
1923
config = ClassificationDatasetConfig(

datafast/examples/keywords_extraction_example.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
from datafast.schema.config import GenericPipelineDatasetConfig
88
from datafast.datasets import GenericPipelineDataset
99
from datafast.llms import OpenAIProvider, GeminiProvider, OllamaProvider, OpenRouterProvider
10+
from datafast.logger_config import configure_logger
11+
from dotenv import load_dotenv
12+
13+
# Load environment variables and configure logger
14+
load_dotenv()
15+
configure_logger()
1016

1117
PROMPT_TEMPLATE = """I will give you a tweet.
1218
Generate a comma separated list of 3 keywords for the tweet. Avoid multi-word keywords.
@@ -63,7 +69,4 @@ def main():
6369

6470

6571
if __name__ == "__main__":
66-
from dotenv import load_dotenv
67-
68-
load_dotenv("secrets.env")
6972
main()

datafast/examples/mcq_contextual_example.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,15 @@
1010
import pandas as pd
1111
from dotenv import load_dotenv
1212

13-
load_dotenv("secrets.env")
13+
load_dotenv()
1414

1515
from datafast.schema.config import MCQDatasetConfig
1616
from datafast.datasets import MCQDataset
1717
from datafast.llms import OpenAIProvider
18+
from datafast.logger_config import configure_logger
19+
20+
# Configure logger
21+
configure_logger()
1822

1923
def main():
2024
# 1. Create a temporary filtered version of the dataset

0 commit comments

Comments (0)