Carolina F. Carolina F. - 2 months ago 14
Python Question

How to convert a nltk tree (Stanford) into newick format in python?

I have this Stanford tree and I want to convert this into newick format.

(ROOT
(S
(NP (DT A) (NN friend))
(VP
(VBZ comes)
(NP
(NP (JJ early))
(, ,)
(NP
(NP (NNS others))
(SBAR
(WHADVP (WRB when))
(S (NP (PRP they)) (VP (VBP have) (NP (NN time))))))))))

Answer

There might be ways to do this just using string processing, but I would parse them and print them in the newick format recursively. A somewhat minimal implementation:

import re

class Tree(object):
    def __init__(self, label):
        self.label = label
        self.children = []

    @staticmethod
    def _tokenize(string):
        return list(reversed(re.findall(r'\(|\)|[^ \n\t()]+', string)))

    @classmethod
    def from_string(cls, string):
        tokens = cls._tokenize(string)
        return cls._tree(tokens)

    @classmethod
    def _tree(cls, tokens):
        t = tokens.pop()
        if t == '(':
            tree = cls(tokens.pop())
            for subtree in cls._trees(tokens):
                tree.children.append(subtree)
            return tree
        else:
            return cls(t)

    @classmethod
    def _trees(cls, tokens):
        while True:
            if not tokens:
                raise StopIteration
            if tokens[-1] == ')':
                tokens.pop()
                raise StopIteration
            yield cls._tree(tokens)

    def to_newick(self):
        if self.children and len(self.children) == 1:
            return ','.join(child.to_newick() for child in self.children)
        elif self.chilren:
            return '(' + ','.join(child.to_newick() for child in self.children) + ')'
        else:
            return self.label

Note that, of course, information gets lost during the conversion, since only terminal nodes are kept. Usage:

>>> s = """(ROOT (..."""
>>> Tree.from_string(s).to_newick()
...
Comments