Thibault Robin Thibault Robin - 3 months ago 8
Java Question

Fastest way to extract uppercase characters in Java

I am currently working with DNA sequences in the form of strings where introns are in lowercase and exons in uppercase characters. The aim of the method is to retrieve the exons in the form of a String as fast as possible.

Example of a sequence :


ATGGATGACAGgtgagaggacactcgggtcccagccccaggctctgccctcaggaagggggtcagctctcaggggcatctccctctcacagcccagccctggggatgatgtgggagccccatttatacacggtgcctccttctctcctagAGCCTACATAG


My first version was using the String replaceAll() method but it was particularly slow:

public String getExons(String sequence) {
return sequence.replaceAll("[atgc]", "");
}


So I tried a new version which improved performance but remains fairly slow:

public String getExons(String sequence) {
StringBuilder exonBuilder = new StringBuilder();

for (int i = 0; i < sequence.length(); i++) {
char c = sequence.charAt(i);
if (c == 'A' || c == 'T' || c == 'G' || c == 'C') exonBuilder.append(c);
}
return exonBuilder.toString();


Is there another way to do it which would improve performance ?

vz0 vz0
Answer

You need to use a char array with double pointer trick. I get this results in my machine:

Edit: updated with warmup phase. Java is OpenJDK 8 from Ubuntu 14 LTS

Edit2: hash table is the fastest by far margin.

Edit3: I had a bug in my code. The double pointer trick is the fastest.

GTCtgACgGT
getExons1: 1068
getExons2: 377
getExons3: 313
getExons3b: 251
getExons4: 586
getExons5: 189
getExons6: 671

The code:

import java.util.HashMap;
import java.util.Random;

public class Test1 {

    static String letters = "ATGCatgc";

    public static String buildRandomDNA(int size) {
        StringBuilder builder = new StringBuilder(size);
        Random r = new Random();

        for (int i = 0; i < size; ++i) {
            builder.append(letters.charAt(r.nextInt(letters.length())));
        }

        return builder.toString();
    }

    public static String getExons1(String sequence) {
        return sequence.replaceAll("[atgc]", "");
    }

    public static String getExons2(String sequence) {
        StringBuilder exonBuilder = new StringBuilder();

        for (int i = 0; i < sequence.length(); i++) {
            char c = sequence.charAt(i);
            if (c == 'A' || c == 'T' || c == 'G' || c == 'C')
                exonBuilder.append(c);
        }
        return exonBuilder.toString();
    }

    public static String getExons3(String sequence) {
        StringBuilder exonBuilder = new StringBuilder();

        for (int i = 0; i < sequence.length(); i++) {
            char c = sequence.charAt(i);
            if (c <= 'Z') {
                exonBuilder.append((char) c);
            }
        }

        return exonBuilder.toString();
    }

    public static String getExons3b(String sequence1) {
        char[] sequence = sequence1.toCharArray();
        StringBuilder exonBuilder = new StringBuilder();

        for (int i = 0; i < sequence.length; i++) {
            if (sequence[i] <= 'Z') {
                exonBuilder.append(sequence[i]);
            }
        }

        return exonBuilder.toString();
    }

    public static HashMap<String, String> M = new HashMap<String, String>();

    public static void buildTable(int size) {
        for (int a = 0; a < letters.length(); ++a) {
            for (int b = 0; b < letters.length(); ++b) {
                for (int c = 0; c < letters.length(); ++c) {
                    for (int d = 0; d < letters.length(); ++d) {
                        String key = "" + letters.charAt(a) + letters.charAt(b) + letters.charAt(c) + letters.charAt(d);
                        M.put(key, getExons1(key));
                    }
                }
            }
        }
    }

    public static String getExons4(String sequence1) {
        char[] sequence = sequence1.toCharArray();
        StringBuilder exonBuilder = new StringBuilder();

        for (int i = 0; i < sequence.length; i += 4) {
            exonBuilder.append(M.get(new String(sequence, i, 4)));
        }

        return exonBuilder.toString();
    }

    public static String getExons5(String sequence1) {
        char[] sequence = sequence1.toCharArray();
        int p = 0;

        for (int i = 0; i < sequence.length; i++) {
            if (sequence[i] <= 'Z') {
                sequence[p] = sequence[i];
                ++p;
            }
        }

        return new String(sequence, 0, p);
    }

    public static int dnatoint(char[] s, int start, int len) {
        int key = 0;
        for (; len > 0; len--, start++) {
            switch (s[start]) {
            case 'A': key = (key << 3) | 0; break;
            case 'C': key = (key << 3) | 1; break;
            case 'G': key = (key << 3) | 2; break;
            case 'T': key = (key << 3) | 3; break;
            case 'a': key = (key << 3) | 4; break;
            case 'c': key = (key << 3) | 5; break;
            case 'g': key = (key << 3) | 6; break;
            case 't': key = (key << 3) | 7; break;
            }
        }
        return key;
    }

    public static String[] M2 = new String[8*8*8*8];

    public static void buildtable2() {
        for (int a = 0; a < letters.length(); ++a) {
            for (int b = 0; b < letters.length(); ++b) {
                for (int c = 0; c < letters.length(); ++c) {
                    for (int d = 0; d < letters.length(); ++d) {
                        String key = "" + letters.charAt(a) + letters.charAt(b) + letters.charAt(c) + letters.charAt(d);
                        M2[dnatoint(key.toCharArray(), 0, 4)] = getExons1(key);
                    }
                }
            }
        }
    }

    public static String getExons6(String sequence1) {
        char[] sequence = sequence1.toCharArray();
        StringBuilder exonBuilder = new StringBuilder();

        assert (sequence.length % 4) == 0;

        for (int i = 0; i < sequence.length; i += 4) {
            exonBuilder.append(M2[dnatoint(sequence, i, 4)]);
        }

        return exonBuilder.toString();
    }

    public static void main(String[] args) {
        System.out.println(buildRandomDNA(10));

        String DNA = buildRandomDNA(50000000);
        String result;

        long t;

        buildTable(4);
        buildtable2();

        // WARMUP
        for (int i = 0; i < 4; ++i) {
            result = getExons1(DNA);

            if (! getExons2(DNA).equals(result))
                throw new RuntimeException("getExons2 fail");
            if (! getExons3(DNA).equals(result))
                throw new RuntimeException("getExons3 fail");
            if (! getExons3b(DNA).equals(result))
                throw new RuntimeException("getExons3b fail");
            if (! getExons4(DNA).equals(result))
                throw new RuntimeException("getExons4 fail");
            if (! getExons5(DNA).equals(result))
                throw new RuntimeException("getExons5 fail");
            if (! getExons6(DNA).equals(result))
                throw new RuntimeException("getExons6 fail");
        }

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons1(DNA);
        System.out.println("getExons1: " + (System.currentTimeMillis() - t) / 4);

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons2(DNA);
        System.out.println("getExons2: " + (System.currentTimeMillis() - t) / 4);

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons3(DNA);
        System.out.println("getExons3: " + (System.currentTimeMillis() - t) / 4);

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons3b(DNA);
        System.out.println("getExons3b: " + (System.currentTimeMillis() - t) / 4);

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons4(DNA);
        System.out.println("getExons4: " + (System.currentTimeMillis() - t) / 4);

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons5(DNA);
        System.out.println("getExons5: " + (System.currentTimeMillis() - t) / 4);

        t = System.currentTimeMillis();
        for (int i = 0; i < 4; ++i)
            getExons6(DNA);
        System.out.println("getExons6: " + (System.currentTimeMillis() - t) / 4);
    }
}
Comments