# extract_chan.py (from the cqsl/detopt_data repository, master branch)

import json
import numpy as np
import camelot
import os
import hashlib


def parse_float_fix_minus(s):
    """Parse a table cell into a float, repairing a mis-extracted minus sign.

    In the extracted table text a leading minus sign is parsed as the
    digit ``2``. A cell whose first character (after removing spaces) is
    ``2`` therefore has that character replaced by ``-`` before conversion.
    Note that this negates *every* value starting with ``2``, so it should
    only be applied to cells where such values are known to be negative.

    Args:
        s: Cell text. Space characters anywhere in the text are ignored.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: If the cell is empty (or contains only spaces), or if
            the remaining text is not a valid number.
    """
    # remove spaces
    s = s.replace(' ', '')
    if not s:
        # Indexing an empty string would raise an unhelpful IndexError.
        raise ValueError('cannot parse a float from an empty cell')
    # fix minus which is parsed as 2
    if s.startswith('2'):
        s = '-' + s[1:]
    return float(s)


def main(fname, prefix=''):
    """Extract selected rows of a table on page 6 of a PDF and save them as JSON.

    The PDF is read with camelot's ``stream`` flavor and the last table found
    on page 6 is used. Rows are located by the text in their first column;
    the remaining columns of each row are converted to floats.

    Args:
        fname: Path of the PDF file to read.
        prefix: String prepended to the output file name ``e_chan.json``.

    Raises:
        ValueError: If one of the labelled rows (``UHF``, ``DMRG: 4000``,
            ``CCSD``, ``CCSD~T!``) does not occur exactly once in the first
            column, or if a cell cannot be converted to a float.
        IndexError: If no row has an empty first column.
    """
    df = camelot.read_pdf(fname, flavor='stream', pages='6')[-1].df
    first_col = df[0]

    data_chan = {}

    # The 'r' values come from the last row whose first column is empty.
    # Characters 'a', ' ' and '0' are stripped from both ends of each cell
    # (presumably removing an "a0" unit suffix -- TODO confirm).
    (blank_rows,) = np.where(first_col == '')
    r_row = df.iloc[blank_rows.tolist()[-1]]
    data_chan['r'] = [float(cell.strip('a 0')) for cell in r_row[1:]]

    # (first-column label, output key); each label must match exactly one row.
    labelled_rows = (
        ('UHF', 'uhf'),
        ('DMRG: 4000', 'dmrg4000_offset'),
        ('CCSD', 'ccsd_offset'),
        ('CCSD~T!', 'ccsd_t_offset'),
    )
    for label, key in labelled_rows:
        (matches,) = np.where(first_col == label)
        # Unpacking fails unless there is exactly one matching row.
        (row,) = matches.tolist()
        data_chan[key] = [
            parse_float_fix_minus(cell) for cell in df.iloc[row][1:]
        ]

    with open(prefix + 'e_chan.json', 'w') as out:
        json.dump(data_chan, out)


if __name__ == '__main__':
    # Script entry point: check that the source PDF is present, optionally
    # verify its checksum, then write the extracted data under `prefix`.
    fname = '6110_1_online.pdf'
    # The file is generated with a different cover page every time, so the
    # hash cannot be checked; set md5 to a hex digest to enable the check.
    md5 = None

    url = 'https://doi.org/10.1063/1.1783212'
    prefix = './reference/'

    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(prefix, exist_ok=True)

    if not os.path.isfile(fname):
        print('file not found, please download', fname)
        print('e.g. from ', url)
        # Exit with a non-zero status so the failure is visible to callers;
        # SystemExit is used directly because the bare exit() helper is only
        # provided by the site module.
        raise SystemExit(1)

    if md5 is not None:
        # Explicit check instead of assert (asserts are removed under -O);
        # the context manager makes sure the file handle is closed.
        with open(fname, 'rb') as f:
            digest = hashlib.md5(f.read()).hexdigest()
        if digest != md5:
            raise SystemExit(
                'md5 mismatch for {}: expected {}, got {}'.format(
                    fname, md5, digest
                )
            )

    main(fname, prefix=prefix)