11from typing import Optional , cast
22
33from steamship import Block , DocTag , File , Steamship , Tag
4- from steamship .data import TagValueKey
4+ from steamship .data import TagKind , TagValueKey
55from steamship .data .plugin .index_plugin_instance import EmbeddingIndexPluginInstance , SearchResults
66from steamship .invocable import post
77from steamship .invocable .package_mixin import PackageMixin
88from steamship .utils .file_tags import update_file_status
9+ from steamship .utils .text_chunker import chunk_text
910
1011DEFAULT_EMBEDDING_INDEX_CONFIG = {
1112 "embedder" : {
@@ -71,13 +72,17 @@ def _get_index(self, index_handle: Optional[str] = None) -> EmbeddingIndexPlugin
def index_text(
    self, text: str, metadata: Optional[dict] = None, index_handle: Optional[str] = None
) -> bool:
    """Chunk ``text`` and insert the chunks into an embedding index.

    The text is split by ``chunk_text`` using this instance's
    ``context_window_size`` and ``context_window_overlap``. Each resulting
    chunk is wrapped in a ``Tag`` whose ``value`` is ``metadata``, and all
    tags are inserted into the index in a single call.

    Optional arguments:
    - index_handle (uses your default index if blank)
    - metadata (returned on embedding results for source attribution)

    Returns ``True`` once the insert call has returned.
    """
    pieces = chunk_text(
        text, chunk_size=self.context_window_size, chunk_overlap=self.context_window_overlap
    )
    # Every chunk shares the same metadata object as its tag value.
    chunk_tags = [Tag(text=piece, value=metadata) for piece in pieces]
    target_index = self._get_index(index_handle)
    target_index.insert(chunk_tags)
    return True
8388
@@ -88,9 +93,9 @@ def _index_block(
8893 _metadata = {}
8994 if metadata :
9095 _metadata .update (metadata )
96+
9197 _metadata .update (
9298 {
93- "source" : "" ,
9499 "file_id" : block .file_id ,
95100 "block_id" : block .id ,
96101 "page" : page_id ,
@@ -103,13 +108,18 @@ def _index_block(
103108 def index_block (
104109 self , block_id : str , metadata : Optional [dict ] = None , index_handle : Optional [str ] = None
105110 ):
111+ """Load a Steamship Block into an embedding index.
112+
113+ Optional arguments:
114+ - index_handle (uses your default index if blank)
115+ - metadata (returned on embedding results for source attribution)
116+ """
106117 block = Block .get (self .client , _id = block_id )
107118 page_id = self ._get_page (block )
108119 _metadata = {}
109120 _metadata .update (metadata )
110121 _metadata .update (
111122 {
112- "source" : "" ,
113123 "file_id" : block .file_id ,
114124 "block_id" : block .id ,
115125 "page" : page_id ,
@@ -122,11 +132,29 @@ def index_block(
def index_file(
    self, file_id: str, metadata: Optional[dict] = None, index_handle: Optional[str] = None
) -> bool:
    """Load a Steamship File into an embedding index.

    The file is fetched by ``file_id``, its status is set to "Indexing",
    every block is passed to ``_index_block``, and the status is then set
    to "Indexed".

    Metadata attached to each block is assembled in this order, with later
    entries overriding earlier ones:
    1. ``mime_type`` from the file, when it is set.
    2. ``title`` from a document-title tag on the file, when one carries a
       non-empty string value (if several do, the last one wins).
    3. The caller-supplied ``metadata``.

    Optional arguments:
    - index_handle (uses your default index if blank)
    - metadata (returned on embedding results for source attribution)

    Returns ``True`` after all blocks have been handed to ``_index_block``.
    """
    file = File.get(self.client, _id=file_id)
    update_file_status(self.client, file, "Indexing")

    _metadata = {}
    if file.mime_type:
        _metadata["mime_type"] = file.mime_type

    for tag in file.tags or []:
        if tag.kind == TagKind.DOCUMENT and tag.name == DocTag.TITLE:
            # A title tag may have no value payload; treat that as "no
            # title" instead of failing on ``None.get`` mid-indexing.
            if title := (tag.value or {}).get(TagValueKey.STRING_VALUE):
                _metadata["title"] = title

    # Caller-supplied metadata is applied last so it takes precedence over
    # the values derived from the file.
    if metadata:
        _metadata.update(metadata)

    for block in file.blocks or []:
        self._index_block(block, metadata=_metadata, index_handle=index_handle)

    update_file_status(self.client, file, "Indexed")
    return True
@@ -135,6 +163,11 @@ def index_file(
def search_index(
    self, query: str, index_handle: Optional[str] = None, k: int = 5
) -> SearchResults:
    """Search an embedding index and wait for the result.

    ``query`` and ``k`` are forwarded to the index's ``search`` call, and the
    returned task is waited on before its result is handed back.

    Optional arguments:
    - index_handle (uses your default index if blank)
    - k (defaults to 5; presumably the number of results to return —
      confirm against the index plugin's ``search`` documentation)
    """
    search_task = self._get_index(index_handle).search(query, k)
    return search_task.wait()
0 commit comments