BPL - 10 months ago 38

Python Question

**OVERVIEW**

I got a set of possible valid chunks I can use to split a text (if possible).

How can i split a given text using these chunks such as the result will be optimized (minimized) in terms of the number of resulting chunks?

**TEST SUITE**

`if __name__ == "__main__":`

import random

import sys

random.seed(1)

# 1) Testing robustness

examples = []

sys.stdout.write("Testing correctness...")

N = 50

large_number = "3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481"

for i in range(100):

for j in range(i):

choices = random.sample(range(i), j)

examples.append((choices, large_number))

for (choices, large_number) in examples:

get_it_done(choices, large_number)

sys.stdout.write("OK")

# 2) Testing correctness

examples = [

# Example1 ->

# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']

(

[

"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",

"10", "20", "30", "40", "50", "60", "70", "80", "90",

"100", "200", "300", "400", "500", "600", "700", "800", "900",

"012345678910203040506070"

],

"0123456789102030405060708090100200300400500600700800900"

),

# Example2

## Solution ['100']

(

["0", "1", "10", "100"],

"100"

),

# Example3

## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']

(

[

"10", "20", "30", "40", "50", "60", "70", "80", "90",

"012345678910203040506070",

"101234567891020304050",

"6070809010020030040050",

"0600700800900"

],

"10123456789102030405060708090100200300400500600700800900"

),

# Example4

### Solution ['12', '34', '56', '78', '90']

(

[

"12", "34", "56", "78", "90",

"890",

],

"1234567890"

),

# Example5

## Solution ['12', '34']

(

[

"1", "2", "3",

"12", "23", "34"

],

"1234"

),

# Example6

## Solution ['100', '10']

(

["0", "1", "10", "100"],

"10010"

)

]

score = 0

for (choices, large_number) in examples:

res = get_it_done(choices, large_number)

flag = "".join(res) == large_number

print("{0}\n{1}\n{2} --> {3}".format(

large_number, "".join(res), res, flag))

print('-' * 80)

score += flag

print(

"Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))

# 3) TODO: Testing optimization, it should provide (if possible)

# minimal cases

How could I solve this problem on python without using a brute-force approach?

Answer Source

Sorry, the implementation is a bit hacky. But I think it always returns the optimal answer. (Did not proove, though.) It is a fast and complete implementation in python and returns the correct answers for all proposed use cases.

The algorithm is recursive and works as follows:

- start at the beginning of the text.
- find matching chunks that can be used as the first chunk.
- for each matching chunk, recursively start at step 1. with the rest of the text (i.e. the chunk removed from the start) and collect the solutions
- return the shortest of the collected solutions

When the algorithm is done, all possible paths (and the not possible ones, i.e. no match at the end) should have been traversed exactly once.

To perform step 2 efficiently, I build a patricia tree for the choices so the possible chunks matching the beginning of the text can be looked up quickly.

```
def get_seq_in_tree(tree, choice):
if type(tree)!=dict:
if choice == tree:
return [choice]
return []
for i in range(1, len(choice)+1):
if choice[:i] in tree:
return [choice[:i]] + get_seq_in_tree(tree[choice[:i]], choice[i:])
return []
def seq_can_end_here(tree, seq):
res = []
last = tree
for e, c in enumerate(seq):
if '' in last[c]:
res.append(e+1)
last = last[c]
return res
def build_tree(choices):
tree = {}
choices = sorted(choices)
for choice in choices:
last = tree
for c in choice:
if c not in last:
last[c] = {}
last = last[c]
last['']=None
return tree
solution_cache = {}
ncalls = 0
def solve(tree, number):
global solution_cache
global ncalls
ncalls +=1
# take every path only once
if number in solution_cache:
return solution_cache[number]
solutions = []
seq = get_seq_in_tree(tree, number)
endings = seq_can_end_here(tree, seq)
for i in reversed(endings):
current_solution = []
current_solution.append(number[:i])
if i == len(number):
solutions.append(current_solution)
else:
next_solution = solve(tree, number[i:])
if next_solution:
solutions.append(current_solution + next_solution)
if not solutions:
return None
shortest_solution = sorted([(len(solution), solution) for solution in solutions])[0][1]
solution_cache[number] = shortest_solution
return shortest_solution
def get_it_done(choices, number):
tree = build_tree(choices)
solution = solve(tree, number)
return solution
if __name__ == "__main__":
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
## Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
## Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
### Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
## Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]
score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag
print("Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))
```

I guess the complexity is something like O(L * N * log(C)) where L is the length of the text, N is the size of the vocabulary and C is the number of choices.

**EDIT:** Included the missing test case.