Skip to content

Commit 75bf270

Browse files
authored
Merge pull request #27 from dw/compat
python-cdb compatibility module
2 parents 936d51e + 436ebdd commit 75bf270

5 files changed

Lines changed: 542 additions & 0 deletions

File tree

README.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ constant databases that don't have the usual 4 GiB restriction.
1616

1717
This package works with Python 3.4 and above.
1818
For a version that works with Python 2, see `this older release <https://github.com/dw/python-pure-cdb/releases/tag/v2.2.0>`_.
19+
To aid in porting `cdb` applications to Python 3, this library provides a
20+
compatibility module for the `python-cdb <https://github.com/acg/python-cdb>`_
21+
package, which can act as a drop-in replacement (see `the docs <https://python-pure-cdb.readthedocs.io>`_).
1922

2023
For more information on constant databases, see `djb's page <https://cr.yp.to/cdb.html>`_
2124
and `Wikipedia <https://en.wikipedia.org/wiki/Cdb_(software)>`_.

cdblib/compat.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
from itertools import chain, cycle, islice, repeat
2+
from mmap import mmap, ACCESS_READ
3+
from os import rename
4+
from os.path import getsize
5+
6+
from .cdblib import Reader, Writer
7+
8+
9+
class error(IOError):
    """Exception type raised by this module.

    Raised, for example, when records are added to a ``cdbmake`` object
    that has already been finished.
    """
11+
12+
13+
class cdbmake:
14+
def __init__(self, cdb, tmp, encoding='utf-8'):
15+
"""Create a new database to be stored at the path given by
16+
*cdb*. Records will be written to the file at the path given by
17+
*tmp*. After the ``finish()`` method is called, the file at *cdb*
18+
will be replaced by the one at *tmp*.
19+
If *encoding* is given, ``str`` keys and values will be converted
20+
to ``bytes`` with the given encoding. If *encoding* is ``None``, only
21+
``bytes`` keys and values are accepted.
22+
"""
23+
self.fn = cdb
24+
self.fntmp = tmp
25+
self.encoding = encoding
26+
27+
self._temp_obj = open(self.fntmp, 'wb')
28+
self._writer = Writer(self._temp_obj, strict=True)
29+
self.numentries = 0
30+
self._finished = False
31+
32+
def _cleanup(self):
33+
try:
34+
self._temp_obj.close()
35+
except Exception:
36+
pass
37+
38+
def __del__(self):
39+
self._cleanup()
40+
41+
def add(self, key, data):
42+
"""Store a record in the database.
43+
"""
44+
if self._finished:
45+
raise error('cdbmake object already finished')
46+
47+
args = []
48+
for arg in (key, data):
49+
if isinstance(arg, bytes):
50+
args.append(arg)
51+
elif isinstance(arg, str) and self.encoding:
52+
args.append(arg.encode(self.encoding))
53+
else:
54+
raise TypeError(
55+
'add method only accepts bytes and str objects'
56+
)
57+
58+
self._writer.put(*args)
59+
self.numentries += 1
60+
61+
def addmany(self, items):
62+
"""Store each of the records in *items* in the the database.
63+
*items* should be an iterable of ``(key, value)`` pairs.
64+
"""
65+
for key, value in items:
66+
self.add(key, value)
67+
68+
@property
69+
def fd(self):
70+
return self._temp_obj.fileno()
71+
72+
def finish(self):
73+
"""Finalize the database being written to. Then move the temporary
74+
database to its final location.
75+
"""
76+
if self._finished:
77+
return
78+
79+
self._writer.finalize()
80+
self._temp_obj.close()
81+
rename(self.fntmp, self.fn)
82+
self._finished = True
83+
84+
85+
class cdb:
    """Read-only view of the constant database stored at a file path.

    The file is opened in binary mode and memory-mapped read-only; records
    are retrieved through a ``Reader`` built on top of that mapping.
    """

    def __init__(self, f, encoding='utf-8'):
        """Open the database at path *f*.

        If *encoding* is given, retrieved keys and values are decoded with
        it where possible; values that cannot be decoded are returned as
        ``bytes``. If *encoding* is ``None`` (or empty), nothing is decoded.
        """
        self._file_path = f

        self.encoding = encoding
        # NOTE(review): strict is only enabled when no encoding is set;
        # presumably a non-strict Reader accepts str keys - confirm against
        # the Reader implementation.
        strict = not bool(encoding)

        self._file_obj = open(self._file_path, mode='rb')
        self._mmap_obj = mmap(self._file_obj.fileno(), 0, access=ACCESS_READ)
        self._reader = Reader(self._mmap_obj, strict=strict)

        self._keys = self._get_key_iterator()
        self._items = self._get_item_iterator()

    def _cleanup(self):
        """Close the memory map and the underlying file, if they exist."""
        # Look the attributes up defensively: when open() or mmap() fails in
        # __init__, __del__ still runs on the partially built object.
        for attr in ('_mmap_obj', '_file_obj'):
            handle = getattr(self, attr, None)
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                # Deliberately best-effort: this also runs from __del__.
                pass

    def __del__(self):
        self._cleanup()

    def _decode(self, value):
        """Return *value* decoded with ``self.encoding`` when possible.

        *value* is returned unchanged when no encoding is configured, when
        it has no ``decode`` method (e.g. ``None``), or when its bytes are
        not valid in the configured encoding.
        """
        if not self.encoding:
            return value

        try:
            return value.decode(self.encoding)
        except (AttributeError, UnicodeDecodeError):
            return value

    def _unique_keys(self):
        """Yield each distinct key once, in the order first encountered."""
        seen = set()
        for key, _ in self._decoded_items():
            if key not in seen:
                seen.add(key)
                yield key

    def _decoded_items(self):
        """Yield every ``(key, value)`` pair, decoded where possible."""
        for pair in self._reader.iteritems():
            if not self.encoding:
                yield pair
            else:
                yield tuple(self._decode(element) for element in pair)

    def _get_key_iterator(self):
        """Return an iterator over the distinct keys, then ``None`` forever."""
        # No itertools.cycle here: the chain never ends, and cycle would
        # keep a copy of every element it has yielded.
        return chain(self._unique_keys(), repeat(None))

    def _get_item_iterator(self):
        """Return an endless iterator over all records.

        Each pass yields every ``(key, value)`` pair followed by a single
        ``None``, then starts over. Records are re-read on each pass rather
        than cached in memory.
        """
        while True:
            for item in self._decoded_items():
                yield item
            yield None

    def each(self):
        """Return successive ``(key, value)`` tuples from the database.
        After the last record is returned, the next call will return ``None``.
        The call after that will return the first record again.
        """
        return next(self._items)

    @property
    def fd(self):
        """File descriptor of the open database file."""
        return self._file_obj.fileno()

    def firstkey(self):
        """Return the first key in the database.
        If ``nextkey()`` is called after ``firstkey()``, the second key will
        be returned.
        """
        self._keys = self._get_key_iterator()
        return next(self._keys)

    def get(self, k, i=0):
        """Return the ``i``-th value (counting from zero) stored under the
        key given by ``k``. If there are fewer than ``i + 1`` values stored
        under key ``k``, return ``None``.
        """
        value = next(islice(self._reader.gets(k), i, i + 1), None)
        return self._decode(value)

    def __getitem__(self, key):
        """Return the first value stored under *key*.

        Raises ``KeyError`` if there is no such key.
        """
        value = self.get(key)
        if value is None:
            raise KeyError(key)

        return value

    def getall(self, k):
        """Return a list of the values stored under key ``k``.
        """
        return [self._decode(value) for value in self._reader.gets(k)]

    def keys(self):
        """Return a list of the distinct keys stored in the database.
        """
        return list(self._unique_keys())

    @property
    def name(self):
        """Path of the database file, as passed to the constructor."""
        return self._file_path

    def nextkey(self):
        """Return the next key in the database, or ``None`` if there are no
        more keys to retrieve. Call ``firstkey()`` to start from the beginning
        again.
        """
        return next(self._keys)

    @property
    def size(self):
        """Size of the database file in bytes."""
        return getsize(self._file_path)
212+
213+
214+
def init(f, encoding='utf-8'):
    """Open the database stored at the file path *f* and return it as a
    ``cdb`` object.

    When *encoding* is given, keys and values retrieved from the returned
    object are decoded with it (if possible).
    """
    database = cdb(f, encoding=encoding)
    return database

docs/compat.rst

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
python-cdb compatibility module
2+
===============================
3+
4+
`cdblib.compat` is designed to be used as a drop-in replacement for
5+
`python-cdb <https://github.com/acg/python-cdb>`_, a Python 2-only module for
6+
interacting with constant databases.
7+
8+
To use it in your Python 3 application:
9+
10+
.. code-block:: python
11+
12+
import cdblib.compat as cdb # replaces import cdb
13+
14+
15+
Reading existing databases
16+
--------------------------
17+
18+
The `init()` function accepts a path to an existing database file. It
19+
returns a `cdb` object that can be used to retrieve records from it.
20+
21+
>>> db = cdb.init('info.cdb')
22+
23+
The `.each()` method returns successive `(key, value)` pairs from the database.
24+
After the last record is returned the next call will return `None`.
25+
The call after that will return the first record again.
26+
27+
>>> db.each()
28+
('a', 'value_a1')
29+
>>> db.each()
30+
('a', 'value_a2')
31+
>>> db.each()
32+
('b', 'value_b1')
33+
>>> db.each() # No more records
34+
>>> db.each() # Loop around to the first record
35+
('a', 'value_a1')
36+
37+
The `.keys()` method returns a list of distinct keys from the database.
38+
39+
>>> db.keys()
40+
['a', 'b']
41+
42+
The `cdb` object keeps an iterator over the distinct keys of the database.
43+
The `.firstkey()` method resets the iterator and returns the first stored key.
44+
`.nextkey()` advances the iterator and returns the next key.
45+
After exhausting the iterator, `None` will be returned until `.firstkey()` is
46+
called again.
47+
48+
>>> db.firstkey()
49+
'a'
50+
>>> db.nextkey()
51+
'b'
52+
>>> db.nextkey() # No more keys
53+
>>> db.firstkey() # Reset the iterator
54+
'a'
55+
56+
Call the `.get()` method with a key `k` and an optional index `i` to retrieve
57+
the `i`-th value stored under `k`. If there is no such value, `.get()` returns
58+
`None`.
59+
60+
>>> db.get('a')
61+
'value_a1'
62+
>>> db.get('a', 1)
63+
'value_a2'
64+
>>> db.get('a', 3) # Returns None
65+
66+
The `cdb` object can be accessed like a `dict` to retrieve the first value
67+
stored under a key. If there is no such key in the database, `KeyError` is
68+
raised.
69+
70+
>>> db['a']
71+
'value_a1'
72+
>>> db['b']
73+
'value_b1'
74+
75+
Call the `.getall()` method to retrieve a list of the values stored under the
76+
key `k`.
77+
78+
>>> db.getall('a')
79+
['value_a1', 'value_a2']
80+
>>> db.getall('b')
81+
['value_b1']
82+
>>> db.getall('c') # No such key, returns empty list
83+
[]
84+
85+
The `cdb` object has a `size` property, which returns the total size of the
86+
database (in bytes). It also has a `name` property, which returns the path
87+
to the database file.
88+
89+
90+
Writing new databases
91+
---------------------
92+
93+
The `cdbmake()` class is used to create a new database. Call it with two
94+
file paths: (1) the ultimate location of the database,
95+
(2) a temporary location to use when creating the database.
96+
97+
>>> cdb_path = '/tmp/info.cdb'
98+
>>> tmp_path = cdb_path + '.tmp'
99+
>>> db = cdb.cdbmake(cdb_path, tmp_path)
100+
101+
Add records to the database with the `.add()` or `.addmany()` methods.
102+
103+
>>> db.add('b', 'value_b1')
104+
>>> db.addmany([('a', 'value_a1'), ('a', 'value_a2')])
105+
106+
Write the database structure to disk and rename the temporary file to the
107+
ultimate file with the `.finish()` method.
108+
109+
110+
Notes on encoding
111+
-----------------
112+
113+
Since `python-cdb` is a Python 2-only module, it does not distinguish between
114+
text and binary keys or values.
115+
116+
In order to handle `str` keys and values, `cdblib.compat` encodes text data
117+
on the way into the database:
118+
119+
>>> new_db.add('text_key', b'\x80 binary data') # Key is encoded to binary
120+
>>> new_db.add(b'\x80 binary key', 'text_data') # Value is encoded to binary
121+
122+
It also decodes text data when reading:
123+
124+
>>> existing_db.get(b'\x80 binary key') # Text value is decoded
125+
'text_data'
126+
>>> existing_db.get('text_key') # Binary value is left alone
127+
b'\x80 binary data'
128+
129+
`utf-8` encoding is used by default in `cdblib.compat.init()` and `cdblib.compat.cdbmake()`.
130+
Pass a different encoding with the `encoding` keyword argument.
131+
132+
Turn off automatic encoding or decoding by supplying `encoding=None`.
133+
All keys and values will be assumed to be `bytes` objects.
134+
135+
>>> existing_db = cdblib.compat.init(cdb_path, encoding=None)
136+
>>> new_db = cdblib.compat.cdbmake(cdb_path, tmp_path, encoding=None)
137+
138+
139+
Other notes
140+
-----------
141+
142+
The `python-cdb` package accepts integer file descriptors as well as file paths
143+
in `init()` and `cdbmake()`. This module does not.
144+
145+
The `cdb` objects (returned by the `init()` function) and the `cdbmake` objects
146+
close their open file objects when they are garbage collected.
147+
You may call the `._cleanup()` method on either one to close the objects
148+
yourself (this method is not available when using the `python-cdb` package).
149+
150+
The `cdb` object returned by the `init()` function uses `mmap.mmap` to avoid
151+
reading the whole database file into memory.
152+
This may be inappropriate when reading database files from certain locations,
153+
such as network drives.
154+
See the `Python docs <https://docs.python.org/3/library/mmap.html>`_ for more
155+
information on `mmap`.

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Contents
88

99
quickstart.rst
1010
library.rst
11+
compat.rst
1112
cli.rst
1213
versions.rst
1314
development.rst

0 commit comments

Comments
 (0)