@@ -95,3 +95,64 @@ path = "path/to/your/files"
files = list(Path(path).glob("*.md"))
p.run({"text_file_converter": {"sources": files}})
```

### In YAML

This is the YAML representation of the indexing pipeline shown above. It reads text files, cleans the text, splits it into individual sentences, and writes them to an in-memory document store.

```yaml
components:
  cleaner:
    init_parameters:
      ascii_only: false
      keep_id: false
      remove_empty_lines: true
      remove_extra_whitespaces: true
      remove_regex: null
      remove_repeated_substrings: false
      remove_substrings: null
      replace_regexes: null
      strip_whitespaces: false
      unicode_normalization: null
    type: haystack.components.preprocessors.document_cleaner.DocumentCleaner
  splitter:
    init_parameters:
      extend_abbreviations: true
      language: en
      respect_sentence_boundary: false
      skip_empty_documents: true
      split_by: sentence
      split_length: 1
      split_overlap: 0
      split_threshold: 0
      use_split_rules: true
    type: haystack.components.preprocessors.document_splitter.DocumentSplitter
  text_file_converter:
    init_parameters:
      encoding: utf-8
      store_full_path: false
    type: haystack.components.converters.txt.TextFileToDocument
  writer:
    init_parameters:
      document_store:
        init_parameters:
          bm25_algorithm: BM25L
          bm25_parameters: {}
          bm25_tokenization_regex: (?u)\\b\\w+\\b
          embedding_similarity_function: dot_product
          index: 64e4f9ab-87fb-47fd-b390-dabcfda61447
          return_embedding: true
        type: haystack.document_stores.in_memory.document_store.InMemoryDocumentStore
      policy: NONE
    type: haystack.components.writers.document_writer.DocumentWriter
connection_type_validation: true
connections:
- receiver: cleaner.documents
  sender: text_file_converter.documents
- receiver: splitter.documents
  sender: cleaner.documents
- receiver: writer.documents
  sender: splitter.documents
max_runs_per_component: 100
metadata: {}
```
0 commit comments