Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions mindsdb_sdk/knowledge_bases.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import copy
import json
from typing import Union, List
from typing import Union, List, Iterable

import pandas as pd

Expand All @@ -16,6 +16,19 @@
from .query import Query
from .databases import Database

MAX_INSERT_SIZE = 1000


def split_data(data: Union[pd.DataFrame, list], partition_size: int) -> Iterable:
    """
    Yield successive slices of *data*, each at most ``partition_size`` items long.

    Works with any sliceable sequence (e.g. a list or a pandas DataFrame,
    which supports positional row slicing). An empty input yields nothing,
    so callers can iterate the result unconditionally.
    """
    # range() with a step walks the chunk start offsets directly;
    # the final slice is naturally shorter when len(data) is not a multiple.
    for start in range(0, len(data), partition_size):
        yield data[start:start + partition_size]
Comment on lines +22 to +30
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correctness: The split_data function doesn't handle empty datasets, which could lead to an infinite loop if an empty DataFrame or list is passed to it.

📝 Committable Code Suggestion

‼️ Ensure you review the code suggestion before committing it to the branch. Make sure it replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
def split_data(data: Union[pd.DataFrame, list], partition_size: int) -> Iterable:
"""
Split data into chunks with partition_size and yield them out
"""
num = 0
while num * partition_size < len(data):
# create results with partition
yield data[num * partition_size: (num + 1) * partition_size]
num += 1
def split_data(data: Union[pd.DataFrame, list], partition_size: int) -> Iterable:
"""
Split data into chunks with partition_size and yield them out
"""
if len(data) == 0:
return
num = 0
while num * partition_size < len(data):
# create results with partition
yield data[num * partition_size: (num + 1) * partition_size]
num += 1

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The AI suggestion is unnecessary: if `data` is empty, `len(data)` is 0, so the `while` condition is false on the first check and the loop body never executes — no infinite loop is possible.



class KnowledgeBase(Query):
"""
Expand Down Expand Up @@ -152,7 +165,7 @@ def insert_webpages(self, urls: List[str], crawl_depth: int = 1,
data=data
)

def insert(self, data: Union[pd.DataFrame, Query, dict], params: dict = None):
def insert(self, data: Union[pd.DataFrame, Query, dict, list], params: dict = None):
"""
Insert data to knowledge base

Expand All @@ -176,9 +189,18 @@ def insert(self, data: Union[pd.DataFrame, Query, dict], params: dict = None):
if isinstance(data, dict):
data = [data]
elif isinstance(data, pd.DataFrame):
data = data.to_dict('records')
else:
raise ValueError("Unknown data type, accepted types: DataFrame, Query, dict")
for df in split_data(data, MAX_INSERT_SIZE):
data = df.to_dict('records')
self.insert(data, params=params)
return
elif not isinstance(data, list):
raise ValueError("Unknown data type, accepted types: DataFrame, Query, dict, list")

# chunking a big input data
if len(data) > MAX_INSERT_SIZE:
for chunk in split_data(data, MAX_INSERT_SIZE):
self.insert(chunk, params=params)
return

data = {'rows': data}
if params:
Expand Down