Source code for emlib.combinatorics

"""
Functions for combinatorics (combinations with repetitions, derangements, etc.)
"""
from __future__ import annotations
import numpy as _np
import itertools
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from typing import TypeVar, TypeAlias, Union, Iterator, Sequence
    from numpy.typing import ArrayLike
    seq_t: TypeAlias = Union[list, tuple, _np.ndarray]
    T = TypeVar("T")


[docs] def combinations_with_repetition(seq: Iterator[T], size: int) -> Iterator[tuple[T, ...]]: """ Yield all combinations of the elements of seq * items can be repeated: (0, 1, 1) is a valid answer * position is relevant: (0, 1, 2) is not the same as (2, 1, 0) .. note:: this is in fact a product of the sequence with itself (see ``itertools.product``) If position is not relevant, use ``itertools.combinations_with_replacement`` """ for group in itertools.product(seq, repeat=size): yield group
[docs] def derangements(seq: Sequence[T]) -> Iterator[list[T]]: """Permutations of seq where each element is not in its original position""" queue = [-1] lenlst = len(seq) while queue: i = queue[-1] + 1 if i == lenlst: queue.pop() elif i not in queue and i != len(queue) - 1: queue[-1] = i if len(queue) == lenlst: yield [seq[x] for x in queue] queue.append(-1) else: queue[-1] = i
def _distance_from_original(seqa: Sequence[T], seqb: Sequence[T]) -> float: distance = 0 for ia, a in enumerate(seqa): ib = seqb.index(a) distance += abs(ib - ia) return distance / len(seqa) def _max_distance(xs: Sequence) -> float: return _distance_from_original(xs, list(reversed(xs)))
[docs] def random_range(length: int) -> _np.ndarray: """ Return an array of ints from 0 to length-1, in random order Args: length: the length of the generated sequence Returns: the generated array """ s = _np.arange(length) _np.random.shuffle(s) return s
[docs] def distance_from_sorted(seq: ArrayLike, offset=0) -> int: """ Distance between seq and a sorted seq. of the same length """ seq = _np.asarray(seq) indices = _np.arange(offset, offset+len(seq)) arr = _np.asarray(seq) dist = _np.abs(arr - indices).sum() return int(dist)
_cached_random_distances = [0, 0, 1, 2, 4, 8, 11, 15, 21, 26, 33, 39, 47, 56, 64, 74, 84, 96, 107, 119, 132, 146, 160, 175, 191, 208, 225, 242, 260, 280, 300, 320, 340, 363, 385, 407, 431, 456, 480, 506, 532, 560, 587, 615, 645, 674, 705, 734, 769, 799, 834, 866, 900, 937, 970, 1007, 1043, 1082, 1120, 1159, 1198, 1240, 1280, 1323, 1366, 1408, 1450, 1495, 1540, 1584, 1634, 1679, 1728, 1775, 1823, 1874, 1925, 1975, 2029, 2077, 2135, 2185, 2241, 2296, 2352, 2405, 2460, 2524, 2579, 2641, 2698, 2756, 2821, 2882, 2945, 3007, 3072, 3131, 3200, 3266]
[docs] def random_distance(length: int, numseqs=1000) -> float: """ Calculate the distance between a sorted seq. and a random seq. for the given length. This is used to measure entropy in a seq. """ if length < len(_cached_random_distances): return _cached_random_distances[length] accum = 0 for _ in range(numseqs): seq = random_range(length) distance = distance_from_sorted(seq) accum += distance avg = accum / numseqs return avg
[docs] def unsortedness(seq: ArrayLike) -> float: """ The entropy of the ordering in this seq. Args: seq: the seq to evaluate Returns: a value between 0 and 1, where 0 means sorted and 1 means random order """ return distance_from_sorted(seq) / float(random_distance(len(seq)))
def _unsortx(seq, entropy, margin=0, debug=False, calculate_rating=False): if isinstance(margin, int): margin = (margin, margin) N = len(seq) - sum(margin) orig_idx = _np.arange(0, N) random_idx = _np.arange(0, orig_idx.size) _np.random.shuffle(random_idx) interpolated_idx = entropy * (random_idx - orig_idx) + orig_idx distances = abs(interpolated_idx - orig_idx) mean_dist = distances.sum() / distances.size def max_distance(N): delta = int(N * 0.5 + 0.5) idxs = _np.arange(N, dtype=int) return _np.abs((idxs - ((idxs + delta) % N))).sum() def worst_distribution(N): distance_left = max_distance(N) out = [] max_individual_distance = N - 1 for i in range(N): d = max_individual_distance if distance_left > max_individual_distance else distance_left distance_left -= d out.append(d) return out max_dist = max_distance(distances.size) resulting_entropy = mean_dist / (max_dist / distances.size) subseq_indices = _np.argsort(interpolated_idx) + margin[0] seq_array = _np.asarray(seq) out = seq_array.copy() if margin != (0, 0): out[margin[0]:-margin[1]] = out[subseq_indices] else: out = out[subseq_indices] if debug: assert len(out) == len(seq) assert set(out) == set(seq) if not calculate_rating: rating = 0 else: deviation = distances.std() maxdeviation = _np.array(worst_distribution(N)).std() rel_deviation = deviation / maxdeviation entropy_weight = 10 deviation_weight = 10 # rel_entropy_gap = abs(resulting_entropy - entropy) / entropy # the highest the rating, the better # rel_entropy_gap: the higher, the further away from desired entropy -> the worse # rel_deviation: the higher, the more concentrated the distances in # individual indices --> the worse rating = 1 - ((abs(resulting_entropy - entropy) / entropy) * 10 + rel_deviation * 10) / (entropy_weight + deviation_weight) if entropy > 0: if _np.all(out == seq_array): rating = 0 return out, rating
[docs] def unsort(seq: ArrayLike, entropy: float, margin=0, tolerance=0.05, numiter=100 ) -> _np.ndarray: """ Generate a permutation of xs unsorted according to the given entropy. Args: seq: a sequence to be unsorted entropy: 0=the original sequence is returned; 1=random sequence is returned margin: a number or tuple (left, right). These elements are left untouched numiter: the number of times the algorithm is run. The best result will be returned Returns: un unsorted version of *seq*, as numpy array, or None if * If entropy == 0: the original sequence is returned * If entropy == 1: a sequence is generated which is as random as possible (this does not mean that there cannot be any fixed points, it refers to the general result) Examples -------- .. code:: # unsort the first 10 numbers, leave 0 and 9 untouched at their places unsort(range(10), 0.5, margin=1) # unsort the given seq., do not touch the first too elements unsort((1,3, 5, 4, 0), 0.2, margin=(2, 0)) """ minentropy = entropy - tolerance*0.5 maxentropy = entropy + tolerance*0.5 results = [] for i in range(numiter): result, rating = _unsortx(seq, entropy, margin) if minentropy <= unsortedness(result) <= maxentropy: return result results.append((result, rating)) if not results: raise ValueError("Could not unsort") results.sort(key=lambda result: abs(unsortedness(result[0]) - entropy)) return results[0][0]
[docs] def unsort2(xs, entropy=1, margin=0, error=0.01, numiter=100): minentropy = entropy - error maxentropy = entropy + error bestentropy = 100 bestsol = None for i in range(numiter): sol = _unsort(xs, entropy, margin) ent = unsortedness(sol) if minentropy <= ent <= maxentropy: return sol if abs(ent - entropy) < abs(entropy - bestentropy): bestentropy = ent bestsol = sol assert bestsol is not None return bestsol
def _unsort(xs: ArrayLike, entropy=1, margin: int | tuple[int, int] = 0) -> _np.ndarray: """ generate a permutation of xs unsorted according to the given entropy. if entropy == 0: the original sequence is returned if entropy == 1: a sequence is generated which is as random as possible (this does not mean that there cannot be any fixed points, it refers to the general result) margin determines a range at the beginning and/or ending that will be left untouched. Examples: # unsort the first 10 numbers, leave 0 and 9 untouched at their places unsort(range(10), 0.5, margin=1) # unsort the given seq., do not touch the first too elements unsort((1,3, 5, 4, 0), 0.2, margin=(2, 0)) """ xs = _np.asarray(xs) if margin != 0: if isinstance(margin, int): margin0, margin1 = margin, margin else: margin0, margin1 = margin margin1 = len(xs) - margin1 unsorted = unsort(xs[margin0:margin1], entropy, 0) out = xs.copy() out[margin0:margin1] = unsorted return out if entropy == 0: return xs L = len(xs) idxs0 = _np.arange(L) random_distr = idxs0.copy() NAN = _np.nan _np.random.shuffle(random_distr) if entropy == 1: return xs[random_distr] out = _np.ones_like(idxs0) * NAN pick_order = idxs0.copy() _np.random.shuffle(pick_order) poss_iss = [] dx = entropy for i in pick_order: i0 = idxs0[i] i1 = random_distr[i] best_i = i0 + int((i1 - i0) * dx + 0.5) poss_is = [best_i] for j in range(1, L): n = best_i + j if n not in poss_is: poss_is.append(n) n = best_i - j if n not in poss_is: poss_is.append(n) poss_is = [poss_i for poss_i in poss_is if 0 <= poss_i < L] poss_iss.append(poss_is) for i, poss_is in zip(pick_order, poss_iss): i0 = idxs0[i] for poss_i in poss_is: if out[poss_i] != NAN: out[poss_i] = i0 break n_missing = out[_np.isnan(out)].size if n_missing > 0: missing = [i for i in idxs0 if i not in out] nans = [i for i in range(L) if _np.isnan(out[i])] distances = [(abs(i - j), i, j) for i in missing for j in nans] distances2 = sorted(distances) missing_index: dict[int, bool] = dict((m, True) for m in missing) for _, i, j in distances2: if missing_index[i]: if _np.isnan(out[j]): missing_index[i] = False out[j] = i n_missing -= 1 if n_missing <= 0: break indices = out.astype(int) return xs[indices] if __name__ == '__main__': import doctest doctest.testmod()