Carolina F. Carolina F. - 4 months ago 31
Python Question

How to convert a nltk tree (Stanford) into newick format in python?

I have this Stanford tree and I want to convert this into newick format.

(NP (DT A) (NN friend))
(VBZ comes)
(NP (JJ early))
(, ,)
(NP (NNS others))
(WHADVP (WRB when))
(S (NP (PRP they)) (VP (VBP have) (NP (NN time))))))))))


There might be ways to do this just using string processing, but I would parse them and print them in the newick format recursively. A somewhat minimal implementation:

import re

class Tree(object):
    def __init__(self, label):
        self.label = label
        self.children = []

    def _tokenize(string):
        return list(reversed(re.findall(r'\(|\)|[^ \n\t()]+', string)))

    def from_string(cls, string):
        tokens = cls._tokenize(string)
        return cls._tree(tokens)

    def _tree(cls, tokens):
        t = tokens.pop()
        if t == '(':
            tree = cls(tokens.pop())
            for subtree in cls._trees(tokens):
            return tree
            return cls(t)

    def _trees(cls, tokens):
        while True:
            if not tokens:
                raise StopIteration
            if tokens[-1] == ')':
                raise StopIteration
            yield cls._tree(tokens)

    def to_newick(self):
        if self.children and len(self.children) == 1:
            return ','.join(child.to_newick() for child in self.children)
        elif self.chilren:
            return '(' + ','.join(child.to_newick() for child in self.children) + ')'
            return self.label

Note that, of course, information gets lost during the conversion, since only terminal nodes are kept. Usage:

>>> s = """(ROOT (..."""
>>> Tree.from_string(s).to_newick()