BPL - 17 days ago 5x
Python Question

# How to split text into chunks minimizing the solution?

OVERVIEW

I got a set of possible valid chunks I can use to split a text (if possible).

How can i split a given text using these chunks such as the result will be optimized (minimized) in terms of the number of resulting chunks?

TEST SUITE

``````if __name__ == "__main__":
import random
import sys

random.seed(1)

# 1) Testing robustness
examples = []
sys.stdout.write("Testing correctness...")
N = 50
large_number = "3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481"
for i in range(100):
for j in range(i):
choices = random.sample(range(i), j)
examples.append((choices, large_number))

for (choices, large_number) in examples:
get_it_done(choices, large_number)
sys.stdout.write("OK")

# 2) Testing correctness
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
# Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
# Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
# Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
# Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]

score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag

print(
"Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))

# 3) TODO: Testing optimization, it should provide (if possible)
#          minimal cases
``````

QUESTION

How could I solve this problem on python without using a brute-force approach?

Sorry, the implementation is a bit hacky. But I think it always returns the optimal answer. (Did not proove, though.) It is a fast and complete implementation in python and returns the correct answers for all proposed use cases.

The algorithm is recursive and works as follows:

1. start at the beginning of the text.
2. find matching chunks that can be used as the first chunk.
3. for each matching chunk, recursively start at step 1. with the rest of the text (i.e. the chunk removed from the start) and collect the solutions
4. return the shortest of the collected solutions

When the algorithm is done, all possible paths (and the not possible ones, i.e. no match at the end) should have been traversed exactly once.

To perform step 2 efficiently, I build a patricia tree for the choices so the possible chunks matching the beginning of the text can be looked up quickly.

``````def get_seq_in_tree(tree, choice):
if type(tree)!=dict:
if choice == tree:
return [choice]
return []
for i in range(1, len(choice)+1):
if choice[:i] in tree:
return [choice[:i]] + get_seq_in_tree(tree[choice[:i]], choice[i:])
return []

def seq_can_end_here(tree, seq):
res = []
last = tree
for e, c in enumerate(seq):
if '' in last[c]:
res.append(e+1)
last = last[c]
return res

def build_tree(choices):
tree = {}
choices = sorted(choices)
for choice in choices:
last = tree
for c in choice:
if c not in last:
last[c] = {}
last = last[c]
last['']=None
return tree

solution_cache = {}
ncalls = 0

def solve(tree, number):
global solution_cache
global ncalls
ncalls +=1

# take every path only once
if number in solution_cache:
return solution_cache[number]

solutions = []
seq =  get_seq_in_tree(tree, number)
endings = seq_can_end_here(tree, seq)
for i in reversed(endings):
current_solution = []
current_solution.append(number[:i])
if i == len(number):
solutions.append(current_solution)
else:
next_solution = solve(tree, number[i:])
if next_solution:
solutions.append(current_solution + next_solution)
if not solutions:
return None

shortest_solution = sorted([(len(solution), solution) for solution in solutions])[0][1]

solution_cache[number] = shortest_solution
return shortest_solution

def get_it_done(choices, number):
tree = build_tree(choices)
solution = solve(tree, number)
return solution

if __name__ == "__main__":

examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
## Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
## Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
### Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
## Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]

score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag

print("Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))
``````

I guess the complexity is something like O(L * N * log(C)) where L is the length of the text, N is the size of the vocabulary and C is the number of choices.

EDIT: Included the missing test case.