# GraphGen/examples/evaluate/evaluate_qa/qa_evaluation_config.yaml
# Source: InternScience/GraphGen @ 10ebc37f3897fc832fafabc5ada6b9bd6f2eaa9c
# Global parameters: working directory and storage backends.
global_params:
  working_dir: cache
  # Graph database backend; supported values: kuzu, networkx.
  graph_backend: networkx
  # Key-value store backend; supported values: rocksdb, json_kv.
  kv_backend: json_kv

nodes:
  # read_files (type: source): entry step with no dependencies.
  # Each id is unique in the pipeline and can be referenced by other steps.
  - id: read_files
    op_name: read
    type: source
    dependencies: []
    params:
      # Input file paths; supported formats: json, jsonl, txt, pdf.
      # See examples/input_examples for examples.
      input_path:
        - examples/input_examples/jsonl_demo.jsonl

  # chunk_documents (type: map_batch): runs after read_files.
  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies: [read_files]
    execution_params:
      replicas: 4
    params:
      # Chunk size for text splitting.
      chunk_size: 1024
      # Chunk overlap for text splitting.
      chunk_overlap: 100

  # build_kg (type: map_batch): runs after chunk_documents.
  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies: [chunk_documents]
    execution_params:
      replicas: 1
      batch_size: 128

  # quiz (type: aggregate): runs after build_kg.
  - id: quiz
    op_name: quiz
    type: aggregate
    dependencies: [build_kg]
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      # Number of quiz samples to generate.
      quiz_samples: 2
      concurrency_limit: 200

  # judge (type: map_batch): runs after quiz.
  - id: judge
    op_name: judge
    type: map_batch
    dependencies: [quiz]
    execution_params:
      replicas: 1
      batch_size: 128

  # partition (type: aggregate): runs after judge.
  - id: partition
    op_name: partition
    type: aggregate
    dependencies: [judge]
    params:
      # ece is a custom partition method based on comprehension loss.
      method: ece
      method_params:
        # Max nodes and edges per community.
        max_units_per_community: 20
        # Min nodes and edges per community.
        min_units_per_community: 5
        # Max tokens per community.
        max_tokens_per_community: 10240
        # Unit sampling strategy; supported values: random, max_loss, min_loss.
        unit_sampling: max_loss

  # generate (type: map_batch): runs after partition; save_output is enabled.
  - id: generate
    op_name: generate
    type: map_batch
    dependencies: [partition]
    execution_params:
      replicas: 1
      batch_size: 128
    save_output: true
    params:
      # Generation method; options: atomic, aggregated, multi_hop, cot, vqa.
      method: aggregated
      # Output data format; options: Alpaca, Sharegpt, ChatML.
      data_format: ChatML

  # evaluate (type: map_batch): runs after generate; save_output is enabled.
  - id: evaluate
    op_name: evaluate
    type: map_batch
    dependencies: [generate]
    execution_params:
      replicas: 1
      batch_size: 128
    save_output: true
    params:
      target: qa
      # Active metrics; reward_score and uni_score are left commented out.
      metrics:
        - length
        - mtld
        # - reward_score
        # - uni_score
      # Settings passed alongside the mtld metric.
      mtld_params:
        threshold: 0.7