BPL BPL - 2 months ago 7
Python Question

How to split text into chunks minimizing the solution?

OVERVIEW

I got a set of possible valid chunks I can use to split a text (if possible).

How can i split a given text using these chunks such as the result will be optimized (minimized) in terms of the number of resulting chunks?

TEST SUITE

if __name__ == "__main__":
import random
import sys

random.seed(1)

# 1) Testing robustness
examples = []
sys.stdout.write("Testing correctness...")
N = 50
large_number = "3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481"
for i in range(100):
for j in range(i):
choices = random.sample(range(i), j)
examples.append((choices, large_number))

for (choices, large_number) in examples:
get_it_done(choices, large_number)
sys.stdout.write("OK")

# 2) Testing correctness
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
# Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
# Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
# Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
# Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]

score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag

print(
"Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))

# 3) TODO: Testing optimization, it should provide (if possible)
# minimal cases


QUESTION

How could I solve this problem on python without using a brute-force approach?

Answer

Sorry, the implementation is a bit hacky. But I think it always returns the optimal answer. (Did not proove, though.) It is a fast and complete implementation in python and returns the correct answers for all proposed use cases.

The algorithm is recursive and works as follows:

  1. start at the beginning of the text.
  2. find matching chunks that can be used as the first chunk.
  3. for each matching chunk, recursively start at step 1. with the rest of the text (i.e. the chunk removed from the start) and collect the solutions
  4. return the shortest of the collected solutions

When the algorithm is done, all possible paths (and the not possible ones, i.e. no match at the end) should have been traversed exactly once.

To perform step 2 efficiently, I build a patricia tree for the choices so the possible chunks matching the beginning of the text can be looked up quickly.

def get_seq_in_tree(tree, choice):
    if type(tree)!=dict:
        if choice == tree:
            return [choice]
        return []
    for i in range(1, len(choice)+1):
        if choice[:i] in tree:
            return [choice[:i]] + get_seq_in_tree(tree[choice[:i]], choice[i:])
    return []

def seq_can_end_here(tree, seq):
    res = []
    last = tree
    for e, c in enumerate(seq):
        if '' in last[c]:
            res.append(e+1)
        last = last[c]
    return res

def build_tree(choices):
    tree = {}
    choices = sorted(choices)
    for choice in choices:
        last = tree
        for c in choice:
            if c not in last:
                last[c] = {}
            last = last[c]
        last['']=None
    return tree

solution_cache = {}
ncalls = 0

def solve(tree, number):
    global solution_cache
    global ncalls
    ncalls +=1

    # take every path only once
    if number in solution_cache: 
        return solution_cache[number]

    solutions = []
    seq =  get_seq_in_tree(tree, number)
    endings = seq_can_end_here(tree, seq)
    for i in reversed(endings):
        current_solution = []
        current_solution.append(number[:i])
        if i == len(number):
            solutions.append(current_solution)
        else:
            next_solution = solve(tree, number[i:])
            if next_solution:
                solutions.append(current_solution + next_solution)
    if not solutions:
        return None

    shortest_solution = sorted([(len(solution), solution) for solution in solutions])[0][1]

    solution_cache[number] = shortest_solution
    return shortest_solution

def get_it_done(choices, number):
    tree = build_tree(choices)
    solution = solve(tree, number)
    return solution


if __name__ == "__main__":

    examples = [
        # Example1 ->
        # Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
        (
            [
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                "10", "20", "30", "40", "50", "60", "70", "80", "90",
                "100", "200", "300", "400", "500", "600", "700", "800", "900",
                "012345678910203040506070"
            ],
            "0123456789102030405060708090100200300400500600700800900"
        ),
        ## Example2
        ## Solution ['100']
        (
            ["0", "1", "10", "100"],
            "100"
        ),
        ## Example3
        ## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
        (
            [
                "10", "20", "30", "40", "50", "60", "70", "80", "90",
                "012345678910203040506070",
                "101234567891020304050",
                "6070809010020030040050",
                "0600700800900"
            ],
            "10123456789102030405060708090100200300400500600700800900"
        ),
        ### Example4
        ### Solution ['12', '34', '56', '78', '90']
        (
            [
                "12", "34", "56", "78", "90",
                "890",
            ],
            "1234567890"
        ),
        ## Example5
        ## Solution ['12', '34']
        (
            [
                "1", "2", "3",
                "12", "23", "34"
            ],
            "1234"
        ),
        # Example6
        ## Solution ['100', '10']
        (
            ["0", "1", "10", "100"],
            "10010"
        )
    ]

    score = 0
    for (choices, large_number) in examples:
        res = get_it_done(choices, large_number)
        flag = "".join(res) == large_number
        print("{0}\n{1}\n{2} --> {3}".format(
            large_number, "".join(res), res, flag))
        print('-' * 80)
        score += flag

    print("Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))

I guess the complexity is something like O(L * N * log(C)) where L is the length of the text, N is the size of the vocabulary and C is the number of choices.

EDIT: Included the missing test case.