"""Encoding and decoding of arrays with fixed number of unique symbols.
While computing BDM, dataset blocks have to be encoded into simple hashable objects
such as strings or integers for efficient lookup of CTM values from reference
datasets.
Currently string-based keys are used in CTM datasets.
However, this may be changed to integer keys in the future
in order to lower the memory footprint.
Integer encoding can also be used for easy generation of objects
of fixed dimensionality, as each such object using a fixed,
finite alphabet of symbols can be uniquely mapped to an integer code.
"""
from collections import deque
import numpy as np
from .utils import prod
def array_from_string(x, shape, cast_to=int):
    """Make array from string code.

    Every character of `x` becomes one element of the output array.

    Parameters
    ----------
    x : str
        String code.
    shape : tuple
        Desired shape of the output array.
    cast_to : type or None
        Cast array to given type. No casting if ``None``
        (elements then stay single-character strings).
        Defaults to integer type.

    Returns
    -------
    array_like
        Array encoded in the string code.

    Examples
    --------
    >>> array_from_string('1010', shape=(4,))
    array([1, 0, 1, 0])
    >>> array_from_string('1000', shape=(2, 2))
    array([[1, 0],
           [0, 0]])
    """
    # ``list(x)`` always produces a list, so the resulting array is
    # at least 1D and needs no special handling of the 0-dim case.
    arr = np.array(list(x))
    # Compare with ``None`` explicitly instead of relying on truthiness:
    # some valid dtype specifications (e.g. ``numpy.dtype`` instances)
    # may evaluate as false and would silently skip the cast.
    if cast_to is not None:
        arr = arr.astype(cast_to)
    return arr.reshape(shape)
def string_from_array(arr):
    """Encode an array as a string code.

    The elements are visited through ``arr.flat`` and their string
    representations are concatenated without any separator.

    Parameters
    ----------
    arr : (N, k) array_like
        *Numpy* array.

    Returns
    -------
    str
        String code of an array.

    Examples
    --------
    >>> string_from_array(np.array([1, 0, 0]))
    '100'
    >>> string_from_array(np.array([[1,0], [3,4]]))
    '1034'
    """
    symbols = (str(value) for value in arr.flat)
    return ''.join(symbols)
def encode_sequence(seq, base=2):
    """Encode sequence of integer-symbols.

    The sequence is read as the digits of a number written in the given
    base, most significant digit first.

    Parameters
    ----------
    seq : (N, ) array_like
        Sequence of integer symbols represented as 1D *Numpy* array.
    base : int
        Encoding base.
        Should be equal to the number of unique symbols in the alphabet.

    Returns
    -------
    int
        Integer code of a sequence. An empty sequence is encoded as ``0``.

    Raises
    ------
    AttributeError
        If `seq` is not 1D.
    TypeError
        If `seq` is not of integer type.
    ValueError
        If `seq` contain values which are negative or beyond the size
        of the alphabet (encoding base).

    Examples
    --------
    >>> encode_sequence(np.array([1, 0, 0]))
    4
    """
    seq = np.asarray(seq)
    if seq.size == 0:
        return 0
    if seq.ndim != 1:
        raise AttributeError("'seq' has to be a 1D array")
    # Accept every integer dtype (int32, int64, unsigned, ...);
    # the ``np.int`` alias is not available in current NumPy releases.
    if not np.issubdtype(seq.dtype, np.integer):
        raise TypeError("'seq' has to be of integer dtype")
    if (seq < 0).any():
        raise ValueError("'seq' has to conisist of non-negative integers")
    if (seq >= base).any():
        raise ValueError("There are symbol codes greater than {}".format(base-1))
    # Horner's scheme on plain Python integers: arbitrary precision,
    # so long sequences cannot overflow a fixed-width NumPy integer.
    code = 0
    for symbol in seq.tolist():
        code = code * base + symbol
    return code
def decode_sequence(code, base=2, min_length=None):
    """Decode sequence from a sequence code.

    The code is expanded into its digits in the given base,
    most significant digit first.

    Parameters
    ----------
    code : int
        Non-negative integer.
    base : int
        Encoding base.
        Should be equal to the number of unique symbols in the alphabet.
    min_length : int or None
        Minimal number of represented bits.
        The result is left-padded with zeros up to this length.
        Use shortest representation if ``None``.

    Returns
    -------
    array_like
        1D *Numpy* array. It is empty (with integer dtype) when
        `code` is ``0`` and no padding is requested.

    Raises
    ------
    ValueError
        If `code` is positive and `base` is lower than 2.

    Examples
    --------
    >>> decode_sequence(4)
    array([1, 0, 0])
    """
    # With ``base == 1`` the division below never reduces ``code``,
    # so the loop would not terminate; smaller bases are meaningless.
    if code > 0 and base < 2:
        raise ValueError("'base' has to be at least 2")
    bits = deque()
    while code > 0:
        code, rest = divmod(code, base)
        bits.appendleft(rest)
    if min_length:
        # A non-positive number of missing digits yields an empty padding.
        bits.extendleft([0] * (min_length - len(bits)))
    if not bits:
        # An empty container would otherwise produce a float array.
        return np.array([], dtype=int)
    return np.array(list(bits))
def encode_array(x, base=2, **kwds):
    """Encode array of integer-symbols.

    The array is flattened with :py:func:`numpy.ravel` and the resulting
    sequence is passed to :py:func:`encode_sequence`.

    Parameters
    ----------
    x : (N, k) array_like
        Array of integer symbols.
    base : int
        Encoding base.
    **kwds :
        Keyword arguments passed to :py:func:`numpy.ravel`.

    Returns
    -------
    int
        Integer code of an array.
    """
    return encode_sequence(np.ravel(x, **kwds), base=base)
def decode_array(code, shape, base=2, **kwds):
    """Decode array of integer-symbols from a sequence code.

    The code is decoded into a sequence padded to the number of cells
    implied by `shape` and then reshaped.

    Parameters
    ----------
    code : int
        Non-negative integer.
    shape : tuple of ints
        Expected array shape.
    base : int
        Encoding base.
    **kwds :
        Keyword arguments passed to :py:func:`numpy.reshape`.

    Returns
    -------
    array_like
        *Numpy* array.

    Raises
    ------
    ValueError
        If the decoded sequence is longer than the number of cells
        in an array of the given shape.
    """
    n_cells = prod(shape)
    symbols = decode_sequence(code, base=base, min_length=n_cells)
    if symbols.size <= n_cells:
        return symbols.reshape(shape, **kwds)
    raise ValueError("{} does not encode array of shape {}".format(code, shape))
def normalize_array(X):
    """Normalize array so symbols are consecutively mapped to 0, 1, 2, ...

    Symbols receive their new codes in order of first appearance
    in the flattened array. The input array is not modified.

    Parameters
    ----------
    X : array_like
        *Numpy* array of arbitrary dimensions.

    Returns
    -------
    array_like
        *Numpy* array of the same dimensions with mapped symbols.

    Examples
    --------
    >>> X = np.array([1, 2, 3], dtype=int)
    >>> normalize_array(X)
    array([0, 1, 2])
    >>> X = np.array([[1,2],[2,1]], dtype=int)
    >>> normalize_array(X)
    array([[0, 1],
           [1, 0]])
    """
    original_shape = X.shape
    # Work on a flattened copy so the caller's array stays untouched.
    flat = X.copy().ravel()
    codes = {}
    for position, symbol in np.ndenumerate(flat):
        # A symbol seen for the first time gets the next free code,
        # which equals the number of symbols registered so far.
        flat[position] = codes.setdefault(symbol, len(codes))
    return flat.reshape(original_shape)
def normalize_key(key):
    """Normalize part key so symbols are consecutively mapped to 0, 1, 2, ...

    Characters receive their new codes in order of first appearance
    in the key.

    Parameters
    ----------
    key : str
        Part key as returned by :py:func:`string_from_array`.

    Returns
    -------
    str
        Normalized key with mapped symbols.

    Examples
    --------
    >>> normalize_key('123')
    '012'
    >>> normalize_key('40524')
    '01230'
    """
    mapping = {}
    # A character seen for the first time gets the next free code,
    # which equals the number of characters registered so far.
    return ''.join(
        mapping.setdefault(symbol, str(len(mapping))) for symbol in key
    )