Lakshmi KrishnaKumaar Lakshmi KrishnaKumaar -4 years ago 91
Python Question

Using chaos game representation for gene sequences [Python program]

I need to represent many gene sequences using chaos game representation I got this python code from BoĊĦtjan Cigan's blog (https://bostjan-cigan.com/chaos-game-representation-of-gene-structure-in-python/)

Author: Bostjan Cigan



https://bostjan-cigan.com



import collections
from collections import OrderedDict
from matplotlib import pyplot as plt
from matplotlib import cm

import pylab
import math

f = open("ensemblSeq.fa")
s1 = f.read()
data = "".join(s1.split("\n")[1:])

def count_kmers(sequence, k):
d = collections.defaultdict(int)
for i in xrange(len(data)-(k-1)):
d[sequence[i:i+k]] +=1
for key in d.keys():
if "N" in key:
del d[key]
return d

def probabilities(kmer_count, k):
probabilities = collections.defaultdict(float)
N = len(data)
for key, value in kmer_count.items():
probabilities[key] = float(value) / (N - k + 1)
return probabilities

def chaos_game_representation(probabilities, k):
array_size = int(math.sqrt(4**k))
chaos = []
for i in range(array_size):
chaos.append([0]*array_size)

maxx = array_size
maxy = array_size
posx = 1
posy = 1
for key, value in probabilities.items():
for char in key:
if char == "T":
posx += maxx / 2
elif char == "C":
posy += maxy / 2
elif char == "G":
posx += maxx / 2
posy += maxy / 2
maxx = maxx / 2
maxy /= 2
chaos[posy-1][posx-1] = value
maxx = array_size
maxy = array_size
posx = 1
posy = 1

return chaos

f3 = count_kmers(data, 3)
f4 = count_kmers(data, 4)

f3_prob = probabilities(f3, 3)
f4_prob = probabilities(f4, 4)

chaos_k3 = chaos_game_representation(f3_prob, 3)
pylab.title('Chaos game representation for 3-mers')
pylab.imshow(chaos_k3, interpolation='nearest', cmap=cm.gray_r)
pylab.show()

chaos_k4 = chaos_game_representation(f4_prob, 4)
pylab.title('Chaos game representation for 4-mers')
pylab.imshow(chaos_k4, interpolation='nearest', cmap=cm.gray_r)
pylab.show()


This code works fine but I have many sequence files I need to iterate through each fasta file in the folder and get individual plots stored in a folder with the name of the image file corresponding to the name of the fasta file how can I modify the code according to my need

I am new to python as well as StackOverflow if any mistake is there kindly ignore

Thanks in advance

Answer Source

So, if you want to apply your code on every file in your directory a very simple way to do this is calling all files inside of a for-loop. I suggest the following:

import collections
import os
from collections import OrderedDict
from matplotlib import pyplot as plt
from matplotlib import cm

import pylab
import math

def count_kmers(sequence, k):
    d = collections.defaultdict(int)
    for i in xrange(len(data)-(k-1)):
        d[sequence[i:i+k]] +=1
    for key in d.keys():
         if "N" in key:
             del d[key]
    return d

def probabilities(kmer_count, k):
    probabilities = collections.defaultdict(float)
    N = len(data)
    for key, value in kmer_count.items():
        probabilities[key] = float(value) / (N - k + 1)
    return probabilities

def chaos_game_representation(probabilities, k):
    array_size = int(math.sqrt(4**k))
    chaos = []
    for i in range(array_size):
        chaos.append([0]*array_size)

    maxx = array_size
    maxy = array_size
    posx = 1
    posy = 1
    for key, value in probabilities.items():
        for char in key:
            if char == "T":
                posx += maxx / 2
            elif char == "C":
                posy += maxy / 2
            elif char == "G":
                 posx += maxx / 2
                 posy += maxy / 2
            maxx = maxx / 2
            maxy /= 2
        chaos[posy-1][posx-1] = value
        maxx = array_size
        maxy = array_size
        posx = 1
        posy = 1
    return chaos

if __name__ == "__main__":
    PATH = os.getcwd()
    filelist = sorted([os.path.join(PATH, f) for f in os.listdir(PATH) if f.endswith('.fa')])
    for file in filelist:
        f = open(file)
        s1 = f.read()
        data = "".join(s1.split("\n")[1:])
        f3 = count_kmers(data, 3)
        f4 = count_kmers(data, 4)

        f3_prob = probabilities(f3, 3)
        f4_prob = probabilities(f4, 4)

        chaos_k3 = chaos_game_representation(f3_prob, 3)
        pylab.title('Chaos game representation for 3-mers')
        pylab.imshow(chaos_k3, interpolation='nearest', cmap=cm.gray_r)
        pylab.savefig(os.path.splitext(file)[0]+'chaos3.png')
        pylab.show()

        chaos_k4 = chaos_game_representation(f4_prob, 4)
        pylab.title('Chaos game representation for 4-mers')
        pylab.imshow(chaos_k4, interpolation='nearest', cmap=cm.gray_r)
        pylab.savefig(os.path.splitext(file)[0]+'chaos4.png')
        pylab.show()

I just wrapped a loop around and added a pylab.savefig() call. Furthermore I used os to get the filenames from your directory. It should work now.

Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download