Coverage for src/derivepassphrase/__init__.py: 100.000%
134 statements
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-14 11:39 +0200
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-14 11:39 +0200
1# SPDX-FileCopyrightText: 2024 Marco Ricci <m@the13thletter.info>
2#
3# SPDX-License-Identifier: MIT
5"""Work-alike of vault(1) – a deterministic, stateless password manager
7"""
9from __future__ import annotations
11import base64
12import collections
13import hashlib
14import math
15import unicodedata
17from typing_extensions import assert_type
19import sequin
20import ssh_agent_client
# Package metadata: author contact and the release version string.
__author__ = "Marco Ricci <m@the13thletter.info>"
__version__ = "0.1.0"
class AmbiguousByteRepresentationError(ValueError):
    """Signal that an object has no single, unique byte representation.

    Raised for text strings whose NFC- and NFD-normalized forms differ,
    because then no single UTF-8 encoding of the text can be chosen.

    """
class Vault:
    """A work-alike of James Coglan's vault.

    Store settings for generating (actually: deriving) passphrases for
    named services, with various constraints, given only a master
    passphrase.  Also, actually generate the passphrase.  The derivation
    is deterministic and non-secret; only the master passphrase need be
    kept secret.  The implementation is compatible with [vault][].

    [James Coglan explains the passphrase derivation algorithm in great
    detail][ALGORITHM] in his blog post on said topic: A principally
    infinite bit stream is obtained by running a key-derivation function
    on the master passphrase and the service name, then this bit stream
    is fed into a [Sequin][sequin.Sequin] to generate random numbers in
    the correct range, and finally these random numbers select
    passphrase characters until the desired length is reached.

    [vault]: https://getvau.lt
    [ALGORITHM]: https://blog.jcoglan.com/2012/07/16/designing-vaults-generator-algorithm/

    """
    _UUID = b'e87eb0f4-34cb-46b9-93ad-766c5ab063e7'
    """A tag used by vault in the bit stream generation."""
    _CHARSETS: collections.OrderedDict[str, bytes]
    """
        Known character sets from which to draw passphrase characters.
        Relies on a certain, fixed order for their definition and their
        contents.

    """
    _CHARSETS = collections.OrderedDict([
        ('lower', b'abcdefghijklmnopqrstuvwxyz'),
        ('upper', b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
        ('alpha', b''),  # Placeholder.
        ('number', b'0123456789'),
        ('alphanum', b''),  # Placeholder.
        ('space', b' '),
        ('dash', b'-_'),
        ('symbol', b'!"#$%&\'()*+,./:;<=>?@[\\]^{|}~-_'),
        ('all', b''),  # Placeholder.
    ])
    # The composite sets are filled in after the fact so that the
    # insertion order of the keys above stays fixed.
    _CHARSETS['alpha'] = _CHARSETS['lower'] + _CHARSETS['upper']
    _CHARSETS['alphanum'] = _CHARSETS['alpha'] + _CHARSETS['number']
    _CHARSETS['all'] = (_CHARSETS['alphanum'] + _CHARSETS['space']
                        + _CHARSETS['symbol'])

    def __init__(
        self, *, phrase: bytes | bytearray | str = b'',
        length: int = 20, repeat: int = 0, lower: int | None = None,
        upper: int | None = None, number: int | None = None,
        space: int | None = None, dash: int | None = None,
        symbol: int | None = None,
    ) -> None:
        """Initialize the Vault object.

        Args:
            phrase:
                The master passphrase from which to derive the service
                passphrases.  If a text string, then the byte
                representation must be unique.
            length:
                Desired passphrase length.
            repeat:
                The maximum number of immediate character repetitions
                allowed in the passphrase.  Disabled if set to 0.
            lower:
                Optional constraint on ASCII lowercase characters.  If
                positive, include this many lowercase characters
                somewhere in the passphrase.  If 0, avoid lowercase
                characters altogether.
            upper:
                Same as `lower`, but for ASCII uppercase characters.
            number:
                Same as `lower`, but for ASCII digits.
            space:
                Same as `lower`, but for the space character.
            dash:
                Same as `lower`, but for the hyphen-minus and underscore
                characters.
            symbol:
                Same as `lower`, but for all other hitherto unlisted
                ASCII printable characters (except backquote).

        Raises:
            AmbiguousByteRepresentationError:
                The phrase is a text string with differing NFC- and
                NFD-normalized UTF-8 byte representations.
            ValueError:
                More characters are required than the passphrase length
                permits, or the constraints leave no allowed characters.

        """
        self._phrase = self._get_binary_string(phrase)
        self._length = length
        self._repeat = repeat
        self._allowed = bytearray(self._CHARSETS['all'])
        self._required: list[bytes] = []

        def subtract_or_require(
            count: int | None, characters: bytes | bytearray
        ) -> None:
            # No constraint given for this character class.
            if not isinstance(count, int):
                return
            if count <= 0:
                # Forbid the whole class.
                self._allowed = self._subtract(characters, self._allowed)
            else:
                # One slot per required occurrence of the class.
                self._required.extend([characters] * count)

        # The order of these calls determines the order of the required
        # slots, and thus the derived passphrase.
        constraints = (
            (lower, 'lower'), (upper, 'upper'), (number, 'number'),
            (space, 'space'), (dash, 'dash'), (symbol, 'symbol'),
        )
        for count, charset_name in constraints:
            subtract_or_require(count, self._CHARSETS[charset_name])
        if len(self._required) > self._length:
            raise ValueError('requested passphrase length too short')
        if not self._allowed:
            raise ValueError('no allowed characters left')
        # Fill the remaining slots with the full set of allowed
        # characters.
        filler = bytes(self._allowed)
        for _ in range(len(self._required), self._length):
            self._required.append(filler)

    def _entropy(self) -> float:
        """Estimate the passphrase entropy, given the current settings.

        The entropy is the base 2 logarithm of the amount of
        possibilities.  We operate directly on the logarithms, and use
        sorting and [`math.fsum`][] to keep high accuracy.

        Note:
            We actually overestimate the entropy here because of poor
            handling of character repetitions.  In the extreme, assuming
            that only one character were allowed, then because there is
            only one possible string of each given length, the entropy
            of that string `s` is always zero.  However, we calculate
            the entropy as `math.log2(math.factorial(len(s)))`, i.e. we
            assume the characters at the respective string position are
            distinguishable from each other.

        Returns:
            A valid (and somewhat close) upper bound to the entropy.
            Negative infinity if there are no character slots at all, or
            if some slot has an empty character set.

        """
        if not self._required or any(not x for x in self._required):
            return float('-inf')
        factors: list[int] = []
        for i, charset in enumerate(self._required):
            # One factor for the slot ordering, one for the choice of
            # character within the slot.
            factors.append(i + 1)
            factors.append(len(charset))
        factors.sort()
        return math.fsum(math.log2(f) for f in factors)

    def _estimate_sufficient_hash_length(
        self, safety_factor: float = 2.0,
    ) -> int:
        """Estimate the sufficient hash length, given the current settings.

        Using the entropy (via `_entropy`) and a safety factor, give an
        initial estimate of the length to use for `create_hash` such
        that using a `Sequin` with this hash will not exhaust it during
        passphrase generation.

        Args:
            safety_factor: The safety factor.  Must be at least 1.

        Returns:
            The estimated sufficient hash length.

        Raises:
            TypeError: The safety factor cannot be converted to a float.
            ValueError: The safety factor is not finite, or less than 1.

        Warning:
            This is a heuristic, not an exact computation; it may
            underestimate the true necessary hash length.  It is
            intended as a starting point for searching for a sufficient
            hash length, usually by doubling the hash length each time
            it does not yet prove so.

        """
        try:
            safety_factor = float(safety_factor)
        except TypeError as e:
            raise TypeError(
                f'invalid safety factor: not a float: {safety_factor!r}'
            ) from e
        if not math.isfinite(safety_factor) or safety_factor < 1.0:
            raise ValueError(f'invalid safety factor {safety_factor!r}')
        # Ensure the bound is strictly positive.
        entropy_bound = max(1, self._entropy())
        # The entropy is measured in bits; the hash length in bytes.
        return int(math.ceil(safety_factor * entropy_bound / 8))

    @staticmethod
    def _get_binary_string(s: bytes | bytearray | str, /) -> bytes:
        """Convert the input string to a read-only, binary string.

        If it is a text string, then test for an unambiguous UTF-8
        representation, otherwise abort.  (That is, check whether the
        NFC and NFD forms of the string coincide.)

        Args:
            s: The string to (check and) convert.

        Returns:
            A read-only, binary copy of the string.

        Raises:
            AmbiguousByteRepresentationError:
                The text string has differing NFC- and NFD-normalized
                UTF-8 byte representations.

        """
        if isinstance(s, str):
            norm = unicodedata.normalize
            if norm('NFC', s) != norm('NFD', s):
                raise AmbiguousByteRepresentationError(
                    'text string has ambiguous byte representation')
            return s.encode('UTF-8')
        return bytes(s)

    @classmethod
    def create_hash(
        cls, phrase: bytes | bytearray | str,
        service: bytes | bytearray, *, length: int = 32,
    ) -> bytes:
        r"""Create a pseudorandom byte stream from phrase and service.

        Create a pseudorandom byte stream from `phrase` and `service` by
        feeding them into the key-derivation function PBKDF2
        (8 iterations, using SHA-1).

        Args:
            phrase:
                A master passphrase, or sometimes an SSH signature.
                Used as the key for PBKDF2, the underlying cryptographic
                primitive.

                If a text string, then the byte representation must be
                unique.
            service:
                A vault service name.  Will be suffixed with
                `Vault._UUID`, and then used as the salt value for
                PBKDF2.
            length:
                The length of the byte stream to generate.

        Returns:
            A pseudorandom byte string of length `length`.

        Raises:
            AmbiguousByteRepresentationError:
                The phrase is a text string with differing NFC- and
                NFD-normalized UTF-8 byte representations.

        Note:
            Shorter values returned from this method (with the same key
            and message) are prefixes of longer values returned from
            this method.  (This property is inherited from the
            underlying PBKDF2 function.)  It is thus safe (if slow) to
            call this method with the same input with ever-increasing
            target lengths.

        Examples:
            >>> # See also Vault.phrase_from_key examples.
            >>> phrase = bytes.fromhex('''
            ... 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39
            ... 00 00 00 40
            ... f0 98 19 80 6c 1a 97 d5 26 03 6e cc e3 65 8f 86
            ... 66 07 13 19 13 09 21 33 33 f9 e4 36 53 1d af fd
            ... 0d 08 1f ec f8 73 9b 8c 5f 55 39 16 7c 53 54 2c
            ... 1e 52 bb 30 ed 7f 89 e2 2f 69 51 55 d8 9e a6 02
            ... ''')
            >>> Vault.create_hash(phrase, b'some_service', length=4)
            b'M\xb1<S'
            >>> Vault.create_hash(phrase, b'some_service', length=16)
            b'M\xb1<S\x827E\xd1M\xaf\xf8~\xc8n\x10\xcc'
            >>> Vault.create_hash(phrase, b'NOSUCHSERVICE', length=16)
            b'\x1c\xc3\x9c\xd9\xb6\x1a\x99CS\x07\xc41\xf4\x85#s'

        """
        phrase = cls._get_binary_string(phrase)
        assert not isinstance(phrase, str)
        salt = bytes(service) + cls._UUID
        return hashlib.pbkdf2_hmac(hash_name='sha1', password=phrase,
                                   salt=salt, iterations=8, dklen=length)

    def generate(
        self, service_name: str | bytes | bytearray, /, *,
        phrase: bytes | bytearray | str = b'',
    ) -> bytes:
        r"""Generate a service passphrase.

        Args:
            service_name:
                The service name.
            phrase:
                If given, override the passphrase given during
                construction.

                If a text string, then the byte representation must be
                unique.

        Returns:
            The service passphrase.

        Raises:
            AmbiguousByteRepresentationError:
                The phrase is a text string with differing NFC- and
                NFD-normalized UTF-8 byte representations.

        Examples:
            >>> phrase = b'She cells C shells bye the sea shoars'
            >>> # Using default options in constructor.
            >>> Vault(phrase=phrase).generate(b'google')
            b': 4TVH#5:aZl8LueOT\\{'
            >>> # Also possible:
            >>> Vault().generate(b'google', phrase=phrase)
            b': 4TVH#5:aZl8LueOT\\{'

        """
        hash_length = self._estimate_sufficient_hash_length()
        assert hash_length >= 1
        # Ensure the service name is a bytes object.  Needed later for
        # safe concatenation.
        if isinstance(service_name, str):
            service_name = service_name.encode('utf-8')
        elif not isinstance(service_name, bytes):
            service_name = bytes(service_name)
        assert_type(service_name, bytes)
        if not phrase:
            phrase = self._phrase
        phrase = self._get_binary_string(phrase)
        # Repeat the passphrase generation with ever-increasing hash
        # lengths, until the passphrase can be formed without exhausting
        # the sequin.  See the guarantee in the create_hash method for
        # why this works.
        while True:
            try:
                required = self._required[:]
                seq = sequin.Sequin(self.create_hash(
                    phrase=phrase, service=service_name, length=hash_length))
                result = bytearray()
                while len(result) < self._length:
                    pos = seq.generate(len(required))
                    charset = required.pop(pos)
                    # Determine if an unlucky choice right now might
                    # violate the restriction on repeated characters.
                    # That is, check if the current partial passphrase
                    # ends with r - 1 copies of the same character
                    # (where r is the repeat limit that must not be
                    # reached), and if so, remove this same character
                    # from the current character's allowed set.
                    if self._repeat and result:
                        bad_suffix = bytes(result[-1:]) * (self._repeat - 1)
                        if result.endswith(bad_suffix):
                            charset = self._subtract(bytes(result[-1:]),
                                                     charset)
                    pos = seq.generate(len(charset))
                    result.extend(charset[pos:pos+1])
            except sequin.SequinExhaustedError:
                hash_length *= 2
            else:
                return bytes(result)

    @staticmethod
    def _is_suitable_ssh_key(key: bytes | bytearray, /) -> bool:
        """Check whether the key is suitable for passphrase derivation.

        Currently, this only checks whether signatures with this key
        type are deterministic.

        Args:
            key: SSH public key to check.

        Returns:
            True if and only if the key is suitable for use in deriving
            a passphrase deterministically.

        """
        # Length-prefixed key type names (SSH wire format strings) of
        # the key types accepted as deterministic.
        deterministic_key_prefixes = (
            b'\x00\x00\x00\x0bssh-ed25519',
            b'\x00\x00\x00\x09ssh-ed448',
            b'\x00\x00\x00\x07ssh-rsa',
        )
        return key.startswith(deterministic_key_prefixes)

    @classmethod
    def phrase_from_key(
        cls, key: bytes | bytearray, /
    ) -> bytes:
        """Obtain the master passphrase from a configured SSH key.

        vault allows the usage of certain SSH keys to derive a master
        passphrase, by signing the vault UUID with the SSH key.  The key
        type must ensure that signatures are deterministic.

        Args:
            key: The (public) SSH key to use for signing.

        Returns:
            The signature of the vault UUID under this key, unframed but
            encoded in base64.

        Raises:
            ValueError:
                The SSH key is principally unsuitable for this use case.
                Usually this means that the signature is not
                deterministic.

        Examples:
            >>> import base64
            >>> # Actual Ed25519 test public key.
            >>> public_key = bytes.fromhex('''
            ... 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39
            ... 00 00 00 20
            ... 81 78 81 68 26 d6 02 48 5f 0f ff 32 48 6f e4 c1
            ... 30 89 dc 1c 6a 45 06 09 e9 09 0f fb c2 12 69 76
            ... ''')
            >>> expected_sig_raw = bytes.fromhex('''
            ... 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39
            ... 00 00 00 40
            ... f0 98 19 80 6c 1a 97 d5 26 03 6e cc e3 65 8f 86
            ... 66 07 13 19 13 09 21 33 33 f9 e4 36 53 1d af fd
            ... 0d 08 1f ec f8 73 9b 8c 5f 55 39 16 7c 53 54 2c
            ... 1e 52 bb 30 ed 7f 89 e2 2f 69 51 55 d8 9e a6 02
            ... ''')
            >>> # Raw Ed25519 signatures are 64 bytes long.
            >>> signature_blob = expected_sig_raw[-64:]
            >>> expected = base64.standard_b64encode(signature_blob)
            >>> Vault.phrase_from_key(public_key) == expected  # doctest:+SKIP
            True

        """
        if not cls._is_suitable_ssh_key(key):
            raise ValueError(
                'unsuitable SSH key: bad key, or signature not deterministic')
        with ssh_agent_client.SSHAgentClient() as client:
            raw_sig = client.sign(key, cls._UUID)
        # Strip the leading key type string; only the signature blob
        # that follows it is used.
        _keytype, trailer = client.unstring_prefix(raw_sig)
        signature_blob = client.unstring(trailer)
        return bytes(base64.standard_b64encode(signature_blob))

    @staticmethod
    def _subtract(
        charset: bytes | bytearray, allowed: bytes | bytearray,
    ) -> bytearray:
        """Remove the characters in charset from allowed.

        This preserves the relative order of characters in `allowed`.
        Neither argument is modified; the result is always a fresh
        bytearray.

        Args:
            charset:
                Characters to remove.  Must not contain duplicate
                characters.
            allowed:
                Character set to remove the other characters from.  Must
                not contain duplicate characters.

        Returns:
            The pruned "allowed" character set.

        Raises:
            ValueError:
                `allowed` or `charset` contained duplicate characters.

        """
        if len(frozenset(allowed)) != len(allowed):
            raise ValueError('duplicate characters in set')
        if len(frozenset(charset)) != len(charset):
            raise ValueError('duplicate characters in set')
        # Always work on a private copy, so that a bytearray passed in
        # by the caller is never modified behind the caller's back.
        pruned = bytearray(allowed)
        for c in charset:
            try:
                pos = pruned.index(c)
            except ValueError:
                # Character not present: nothing to remove.
                pass
            else:
                del pruned[pos]
        return pruned