# context_chat_backend/config.gpu.yaml (nextcloud/context_chat_backend, branch master)
# SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# --- General / logging ---
debug: true
use_colors: true
uvicorn_log_level: info
uvicorn_workers: 1

# --- Security ---
# NOTE(review): "aaa" presumably refers to the request authentication layer;
# confirm against the backend code before setting this to true.
disable_aaa: false
verify_ssl: true

# --- Document processing ---
embedding_chunk_size: 2000
doc_parser_worker_limit: 10


# Vector database backend; 'pgvector' is the only backend key present in this file.
vectordb:
  pgvector:
    # No options are set below, so 'pgvector' parses as an empty (null) value.
    # all options: https://python.langchain.com/api_reference/postgres/vectorstores/langchain_postgres.vectorstores.PGVector.html
    # 'connection' overrides the env var 'CCB_DB_URL'

embedding:
  # Embedding service settings.
  # To use an external embedding service, set the CC_EM_BASE_URL and
  # CC_EM_APIKEY env vars during deployment. When those env vars are set, this
  # section is ignored, except for request_timeout, which is always respected
  # (even for a remote service).
  request_timeout: 1800  # in seconds
  workers: 1
  base_url: http://localhost:5000/v1
  # batch_size: 100  # max texts per embedding API request, 0 = no batching

  # The commented-out options below apply only to an external embedding service:
  # remote_service: true
  # model_name: text-embedding-3-small
  # auth:
  #   apikey: your_api_key_here
  #   # -or-
  #   username: your_username_here
  #   password: your_password_here

  llama:
    # all options: https://python.langchain.com/api_reference/community/embeddings/langchain_community.embeddings.llamacpp.LlamaCppEmbeddings.html
    # 'model_alias' is reserved
    # 'embedding' is always set to True
    n_gpu_layers: -1
    n_ctx: 8192
    n_batch: 16
    model: multilingual-e5-large-instruct-q6_k.gguf


# LLM backends. Each child key configures one backend. The prompt templates use
# the placeholders {context} and {question}; 'no_ctx_template' only uses
# {question}.
llm:
  # No options are set for this backend, so the key parses as an empty (null) value.
  nc_texttotext:

  llama:
    # all options: https://python.langchain.com/api_reference/community/llms/langchain_community.llms.llamacpp.LlamaCpp.html
    model_path: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
    n_batch: 512
    n_ctx: 8192
    max_tokens: 4096
    # NOTE(review): ChatML is usually written without spaces around the role
    # name ("<|im_start|>system\n"); the spacing below is kept as-is. Confirm
    # against the model card before changing it.
    template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to exercise source criticism as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly.  Detect the language of the question and make sure to use the same language that was used in the question to answer the question. Don't mention which language was used, but just answer the question directly in the same language. \nQuestion: {question} Let's think this step-by-step. \n<|im_end|>\n<|im_start|> assistant\n"
    no_ctx_template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant.<|im_end|>\n<|im_start|> user\n{question}<|im_end|>\n<|im_start|> assistant\n"
    # Same token that closes each turn in the templates above.
    end_separator: "<|im_end|>"
    # -1 asks llama.cpp to offload all layers to the GPU.
    n_gpu_layers: -1
    model_kwargs:
      device: cuda

  ctransformer:
    # all options: https://python.langchain.com/api_reference/community/llms/langchain_community.llms.ctransformers.CTransformers.html
    model: dolphin-2.2.1-mistral-7b.Q5_K_M.gguf
    # Same prompt text as llm.llama.template; keep the two in sync when editing.
    template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant, good at finding relevant context from documents to answer questions provided by the user. <|im_end|>\n<|im_start|> user\nUse the following documents as context to answer the question at the end. REMEMBER to exercise source criticism as the documents are returned by a search provider that can return unrelated documents.\n\nSTART OF CONTEXT: \n{context} \n\nEND OF CONTEXT!\n\nIf you don't know the answer or are unsure, just say that you don't know, don't try to make up an answer. Don't mention the context in your answer but rather just answer the question directly.  Detect the language of the question and make sure to use the same language that was used in the question to answer the question. Don't mention which language was used, but just answer the question directly in the same language. \nQuestion: {question} Let's think this step-by-step. \n<|im_end|>\n<|im_start|> assistant\n"
    no_ctx_template: "<|im_start|> system \nYou're an AI assistant named Nextcloud Assistant.<|im_end|>\n<|im_start|> user\n{question}<|im_end|>\n<|im_start|> assistant\n"
    end_separator: "<|im_end|>"
    config:
      context_length: 8192
      max_new_tokens: 4096
      local_files_only: true
      gpu_layers: -1

  hugging_face:
    # all options: https://python.langchain.com/api_reference/community/llms/langchain_community.llms.huggingface_pipeline.HuggingFacePipeline.html
    model_id: gpt2
    task: text-generation
    pipeline_kwargs:
      config:
        max_length: 200
    # Empty prompt template for this backend.
    template: ""