# Source: auto-archiver/example.config.yaml at main (robbertdam/auto-archiver on GitHub)
---
secrets:
  # Required when storage=s3.
  s3:
    # S3 connection details: region, bucket, API key and API secret.
    region: reg1
    bucket: my-bucket
    key: 's3 API key'
    secret: 's3 API secret'
    # Write the endpoint as a template containing the {region} placeholder, e.g.:
    endpoint_url: 'https://{region}.digitaloceanspaces.com'
    # endpoint_url: 'https://s3.{region}.amazonaws.com'
    # The CDN URL may use {bucket}, {region} and {key}, where {key} is the
    # archived file path generated when executing, e.g.:
    cdn_url: 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
    # When private is true, the S3 URLs will not be readable online.
    private: false
    # 'random' generates a random UUID for the URL instead of a predictable
    # path, which is useful to keep files public but unlisted. The alternative
    # is 'default', or leaving key_path out of the config.
    key_path: random

  # Required when storage=gd.
  google_drive:
    # Google authentication offers two options: (1) a service account, or
    # (2) an OAuth token.

    # Option 1, service account: storage space counts towards the developer account.
    # The file can be the same as, or different from, google_sheets.service_account;
    # it defaults to "service_account.json".
    # service_account: "service_account.json"

    # Option 2, OAuth token: storage space counts towards the owner of the GDrive folder.
    # Use only one of the two options; if both are specified, option 2 takes precedence.
    # Write access on the server is needed so that the refresh flow works.
    # To get the token, run the file `create_update_test_oauth_token.py`; edit that
    # file if you want a different token filename (the default is "gd-token.json").
    oauth_token_filename: 'gd-token.json'

    # Placeholder: copy the XXXX part of your folder URL.
    root_folder_id: 'copy XXXX from https://drive.google.com/drive/folders/XXXX'

  # Required when storage=local.
  local:
    # Local path in which archived files are saved.
    save_to: './local_archive'

  wayback:
    # Credentials can be obtained at https://archive.org/account/s3.php
    key: 'your API key'
    secret: 'your API secret'

  telegram:
    # To obtain credentials, see:
    # https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
    # Placeholder for the Telegram APP ID (the original placeholder text ended
    # with a stray ", see" fragment and called the ID a "key").
    api_id: your API ID
    api_hash: your API hash
    # Optional, but allows access to more content such as large videos;
    # talk to @botfather to get a token.
    bot_token: your bot-token

  # Twitter configuration (API V2 only).
  # Without credentials, the less-effective unofficial TwitterArchiver is used instead.
  twitter:
    # Option A: provide the bearer token only.
    bearer_token: ''
    # Option B: provide all of the values below.
    consumer_key: ''
    consumer_secret: ''
    access_token: ''
    access_secret: ''

  # Credentials for vkontakte (vk.com).
  vk:
    username: 'phone number or email'
    password: 'password'

  google_sheets:
    # Local filename of the service account file; defaults to service_account.json.
    # See https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
    service_account: 'service_account.json'

  facebook:
    # Optional facebook cookie, taken from a browser, that gives more access
    # to content; it looks like 'cookie: datr= xxxx'.
    cookie: ''
execution:
  # Sheet to process; can be overwritten with CMD --sheet=
  sheet: 'your-sheet-name'

  # Block or allow worksheets by name, instead of the default of checking
  # all worksheets in a Spreadsheet.
  # Both worksheet_allow and worksheet_block accept a single value or a list.
  # When worksheet_allow is specified, worksheet_block is ignored.
  # worksheet_allow:
  #   - Sheet1
  #   - "Sheet 2"
  # worksheet_block: BlockedSheet

  # Row of your tabs that contains the header; can be overwritten with CMD --header=
  header: 1
  # Storage to use; can be overwritten with CMD --storage=
  storage: 's3'
  # Defaults to false; when true, tries to avoid duplicate URL archives.
  check_if_exists: true

  # Hash algorithm to use: either SHA-256 or SHA3-512 (defaults to SHA-256).
  # hash_algorithm: SHA-256

  # optional configurations for the selenium browser that takes screenshots, these are the defaults
  selenium:
    # timeout values under 10s might cause screenshots to fail
    timeout_seconds: 120
    window_width: 1400
    window_height: 2000

  # Optional browsertrix configuration. For profile generation see
  # https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles
  # Browsertrix captures a WACZ archive of the page, which can then be viewed
  # as the original on replaywebpage.
  browsertrix:
    # defaults to false
    enabled: true
    profile: './browsertrix/crawls/profile.tar.gz'
    # defaults to 90s
    timeout_seconds: 120
  # writes execution logs into the /logs folder; defaults to false
  save_logs: true
  # Custom column names; only needed when they differ from the defaults.
  # Each can be overwritten with CMD --col-NAME="VALUE".
  # url and status are the only columns required to be present in the google sheet.
  column_names:
    url: 'link'
    status: 'archive status'
    archive: 'archive location'
    # Use this column to override the default location data.
    folder: 'folder'
    date: 'archive date'
    thumbnail: 'thumbnail'
    thumbnail_index: 'thumbnail index'
    timestamp: 'upload timestamp'
    title: 'upload title'
    duration: 'duration'
    screenshot: 'screenshot'
    hash: 'hash'
    wacz: 'wacz'
    # For the replaywebpage to work, make sure to allow CORS on your bucket.
    replaywebpage: 'replaywebpage'