C. Hinsley

19 September 2024


The Lempel-Ziv (LZ) algorithm takes a binary string (though it generalizes to arbitrary finite alphabets, not just binary) and passes over it left-to-right, building a dictionary of newly observed substrings, and at the end counts the symbols in the dictionary to obtain a measure of complexity of the binary string. If you normalize the LZ complexity, multiplying by $\log_2(n)/n$, where $n$ is the length of the string, this complexity measure will converge for large $n$ to the entropy rate of a source channel which produced the string.

I was curious if there is a possibility that LZ complexity is sensitive to reversals of the binary string, because it does build a dictionary in a relatively chaotic way — perhaps indicative of a directional preference forward or backward — for sufficiently complex sequences. By sensitivity, I mean such that the discrepancy between the normalized Lempel-Ziv complexity of a binary string and its reversal fails to converge to zero for large $n$.

Below you can find a tabulation of these discrepancies:

Length ($n$) String LZ LZ reverse Discrepancy Normalized ($\log_2(n)/n$) Discrepancy
1 0 1 1 0 0.00000
2 00 2 2 0 0.00000
3 001 2 3 1 0.52832
4 0001 3 3 0 0.00000
5 0000 1 3 4 1 0.46439
6 0000 01 3 4 1 0.43083
7 0000 001 3 4 1 0.40105
8 0000 0010 4 5 1 0.37500
9 0000 0000 1 4 5 1 0.35221
10 0001 1001 00 6 4 2 0.66439
11 0001 1100 100 6 4 2 0.62899
12 0000 0010 0001 4 6 2 0.59749
13 0000 0010 0001 0 4 6 2 0.56930
14 0000 0001 1001 00 7 5 2 0.54391
15 0000 0001 1100 100 7 5 2 0.52092
16 0001 1110 0110 0100 8 5 3 0.75000
17 0001 1010 0111 0010 0 8 5 3 0.72132
18 0000 0010 0110 1001 11 5 8 3 0.69499
19 0000 0010 0011 0001 001 5 8 3 0.67073
20 0000 0001 1110 0110 0100 9 6 3 0.64829
21 0000 0001 1010 0111 0010 0 9 6 3 0.62747
22 0000 0001 1010 0110 1111 11 9 6 3 0.60810
23 0000 0001 1001 0100 0000 100 9 6 3 0.59003
24 0000 0000 0000 0010 0001 1110 6 9 3 0.57312
25 0010 0111 0000 1110 1111 0000 1 6 10 4 0.74302
26 0001 1010 1100 1101 0011 1001 00 10 6 4 0.72314
27 0000 0010 0000 0110 0101 0000 111 6 10 4 0.70443
28 0000 0010 0000 0110 0010 1000 0111 6 10 4 0.68676
29 0000 0001 1110 1000 1101 0000 0010 0 11 7 4 0.67007
30 0000 0001 1010 1100 1101 0011 1001 00 11 7 4 0.65425
31 0001 0000 1011 1110 1001 0110 1011 011 11 7 4 0.63925
32 0001 0000 0001 0001 0100 0000 0110 1010 7 11 4 0.62500
33 0001 1110 0000 1011 0011 1100 1011 0010 0 11 6 5 0.76430
34 0001 1110 1001 0011 0011 1100 1001 1001 00 11 6 5 0.74816
35 0101 0110 0000 1101 0110 0110 1001 1100 100 12 7 5 0.73275

Julia code used to calculate the data above:

using Base.Threads

# Lempel-Ziv 1976 complexity (no normalization).
function LZ76_complexity(sequence::Vector{Int})::Float64
    if isempty(sequence)
        return 0.0
    end

    complexity = 0
    i = 1
    n = length(sequence)
    # b = length(unique(sequence))  # Unused variable, can be removed if not needed

    while i <= n
        max_match_length = 0
        
        # Search for the longest match in the window (from the start up to position i-1).
        for j in 1:(i-1)
            match_length = 0
            while (i + match_length <= n) && (sequence[j + match_length] == sequence[i + match_length])
                match_length += 1
                # Prevent overlapping matches.
                if (j + match_length > i - 1)
                    break
                end
            end
            if match_length > max_match_length
                max_match_length = match_length
            end
        end
        
        if max_match_length > 0
            # Found a match; increment complexity and move the pointer.
            complexity += 1
            i += max_match_length + 1
        else
            # No match found; treat the current symbol as a new phrase.
            complexity += 1
            i += 1
        end
    end

    return complexity
end

# Find the binary string of length n (WLOG starting with 0) which has the
# greatest discrepancy in LZ76 complexity from its reverse string.
function maximal_discrepancy(n::Int)::Tuple{Vector{Int}, Int}
    if n < 1
        error("Length of the binary string must be at least 1.")
    end
    
    if n == 1
        return ([0], 0)
    end
    
    if n == 2
        return ([0, 0], 0)
    end

    max_discrepancy = -1.0
    max_string = Int[]

    total_strings = 2^(n-1)  # Since the first bit is fixed to 0.

    # Initialize per-thread storage for maximum discrepancy and corresponding string.
    n_threads = Threads.nthreads()
    per_thread_max_discrepancy = fill(-1.0, n_threads)
    per_thread_max_string = [Int[] for _ in 1:n_threads]

    Threads.@threads for k in 0:(total_strings - 1)
        # Generate the binary string starting with 0.
        bits = zeros(Int, n)
        bits[1] = 0  # First bit is 0.

        for bit_pos in 2:n
            bits[bit_pos] = (k >> (n - bit_pos)) & 1
        end
        
        bits_reversed = reverse(bits)

        # Check for palindromic strings.
        if bits == bits_reversed
            continue  # Skip palindromic strings as discrepancy is zero.
        end

        # If n is even, check for anti-palindromic strings (equal to reverse and inversion).
        if iseven(n) && bits == (1 .- bits_reversed)
            continue  # Skip anti-palindromic strings as discrepancy is zero.
        end

        # Compute complexities.
        c_original = LZ76_complexity(bits)
        c_reversed = LZ76_complexity(bits_reversed)

        # Compute discrepancy.
        discrepancy = abs(c_original - c_reversed)
        
        # Update thread-local maximum discrepancy and corresponding string if needed.
        thread_id = threadid()
        if discrepancy > per_thread_max_discrepancy[thread_id]
            per_thread_max_discrepancy[thread_id] = discrepancy
            per_thread_max_string[thread_id] = copy(bits)
        end
    end

    # After the parallel loop, find the global maximum discrepancy and the corresponding string.
    for thread_id in 1:n_threads
        if per_thread_max_discrepancy[thread_id] > max_discrepancy
            max_discrepancy = per_thread_max_discrepancy[thread_id]
            max_string = copy(per_thread_max_string[thread_id])
        end
    end

    return (max_string, Int(max_discrepancy))
end