Kalyan Pradhan Kalyan Pradhan - 5 months ago 11
Java Question

Is there any compression method in Java to reduce the number of characters in a string?

I am currently facing a problem while compressing a string to fewer characters in java.

I have a huge string of about 751396 characters, and there is a requirement to compress it into 1500 characters.

I have tried GZIP Compressor, Inflater & Deflater but these libraries return byte arrays

Then I tried the LZ-String compressor, with which I was able to get satisfactory results using UTF-16 and Base64 encoding. But these encodings return some characters which are neither alphanumeric nor included in the list of permitted symbols.

N.B. The list for the Symbols is [+,-,*,/,!,@,#]

Is there any other technique for compressing the string into another string with fewer characters, providing a compression ratio of at least 30%?

The code I am using for GZIP compression is as follows:

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Helpers for GZIP-compressing a string to bytes and restoring it again.
 *
 * <p>Text is always converted to and from bytes as UTF-8, so the result does not
 * depend on the platform default charset.
 */
public class GZIPCompression {

    /** Charset used on both the compress and the decompress side. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /** Size of the scratch buffer used while inflating. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * GZIP-compresses the UTF-8 bytes of {@code str}.
     *
     * @param str text to compress; may be {@code null} or empty
     * @return the GZIP stream, or {@code null} when {@code str} is {@code null} or empty
     * @throws IOException if writing to the GZIP stream fails
     */
    public static byte[] compress(final String str) throws IOException {
        if ((str == null) || (str.length() == 0)) {
            return null;
        }
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        // try-with-resources closes the stream (writing the GZIP trailer) even on failure.
        try (GZIPOutputStream gzip = new GZIPOutputStream(sink)) {
            gzip.write(str.getBytes(UTF_8));
        }
        return sink.toByteArray();
    }

    /**
     * Restores the text produced by {@link #compress(String)}.
     *
     * <p>Input that does not start with the GZIP magic number is treated as plain
     * UTF-8 text and returned as-is.
     *
     * @param compressed GZIP stream (or plain UTF-8 bytes); may be {@code null} or empty
     * @return the decoded text, or {@code ""} when {@code compressed} is {@code null} or empty
     * @throws IOException if the GZIP stream is corrupt or truncated
     */
    public static String decompress(final byte[] compressed) throws IOException {
        if ((compressed == null) || (compressed.length == 0)) {
            return "";
        }
        if (!isCompressed(compressed)) {
            return new String(compressed, UTF_8);
        }
        // Copy raw bytes instead of using readLine(): readLine() strips line
        // terminators, so the restored text would differ from the original.
        try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(compressed))) {
            ByteArrayOutputStream restored = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = gis.read(buffer)) != -1) {
                restored.write(buffer, 0, read);
            }
            return new String(restored.toByteArray(), UTF_8);
        }
    }

    /**
     * Tells whether {@code compressed} starts with the two-byte GZIP magic number.
     *
     * @param compressed bytes to inspect; may be {@code null}
     * @return {@code true} only if at least two bytes are present and they match
     *         {@link GZIPInputStream#GZIP_MAGIC} (low byte first)
     */
    public static boolean isCompressed(final byte[] compressed) {
        if ((compressed == null) || (compressed.length < 2)) {
            return false;
        }
        return (compressed[0] == (byte) (GZIPInputStream.GZIP_MAGIC))
                && (compressed[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8));
    }
}


The code for the Inflater & Deflater program is as follows:-

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

public class Apple {

public static void main(String[] args) {
String sr = " [120,-100,-19,89,91,79,-21,56,16,-2,43,40,-49,104,55,113,-18,-68,-91,-23,21,104,90,-38,-62,10,-83,120,48,-83,91,34,-46,-92,-21,-92,8,-124,-8,-17,103,-100,-92,77,-22,-38,-25,112,86,27,-119,106,65,84,-86,103,-58,-10,124,-98,-15,101,-66,-66,43,25,126,-99,-112,116,-109,-60,41,81,46,-34,-107,-57,36,121,14,-29,-43,-20,109,3,77,101,-94,-100,43,120,-79,-115,50,63,-39,-58,25,8,52,16,-52,-97,-62,104,-79,19,-88,32,8,-29,37,-114,-77,-70,-92,71,-78,89,125,-36,-65,-33,-107,5,-50,-40,-120,-86,-11,39,82,-31,95,-77,-40,-48,-21,-94,15,82,13,11,-58,-115,112,-102,-6,-55,-126,-103,-7,126,-65,53,-22,-113,-64,-58,123,-63,97,52,37,-85,53,97,-106,-17,74,55,10,87,79,-39,96,-63,-100,65,76,31,-46,40,-116,73,-39,111,-38,3,81,97,18,108,-41,-113,-124,-126,-52,-48,-100,-62,-50,-89,120,-103,-107,-56,108,-99,9,71,52,92,-123,49,52,91,-41,-109,125,19,44,55,9,-51,102,-124,-82,-61,24,71,-96,5,85,-101,-92,25,-76,-78,48,-55,-51,71,-61,67,-103,-92,-49,6,-45,108,75,73,27,-80,-49,-62,53,-101,23,-64,-25,75,-96,89,103,72,-67,48,-44,11,-107,-83,-105,71,105,-8,-126,35,-119,29,-70,-48,74,-69,-10,-106,-18,92,-48,-98,104,122,-90,-85,48,93,10,-118,2,108,-78,-100,102,-55,38,85,46,-44,115,-27,46,-60,-123,23,-2,-106,82,18,-49,-33,-54,21,26,4,-109,-35,-86,-114,-15,107,23,-125,119,36,-125,70,-102,71,-55,23,-58,96,47,5,-60,-13,-61,-24,-80,-28,96,97,105,-31,52,-100,123,101,60,53,-61,112,33,12,48,54,19,-61,-56,74,-112,116,41,-127,-42,74,41,-28,-69,-4,34,-53,109,-68,-64,-113,17,1,-7,-3,77,-18,-8,44,-55,112,4,-39,-77,27,12,68,61,-102,-92,-23,126,112,-45,48,-64,-91,100,-67,14,-45,-76,88,11,-45,-2,-61,-75,-108,-113,-113,-13,67,0,-99,-114,12,64,-91,17,3,-128,-124,108,14,0,-46,28,-99,3,-32,104,66,0,-35,-82,12,64,-91,-111,0,48,-35,6,1,-40,-74,-54,1,-48,84,93,-120,-96,-33,-105,33,-88,52,98,4,-70,-34,96,8,116,-45,114,121,4,-70,24,-63,-27,-91,12,65,-91,-111,32,112,27,-116,-127,-127,44,-115,71,96,-70,66,4,87,87,50,4,-107,70,-116,-64,112,26,-116,-127,-87,89,54,-113,-64,21,-57,-32,-6,90,-122,-96,-46,-120,17,-104,77,34,-80,
-112,-50,111,100,36,-55,-94,-31,80,-122,-96,-46,-120,17,-40,77,30,69,-74,-87,-15,89,-124,36,103,81,16,-56,16,84,26,49,2,71,111,112,31,56,-82,-55,-97,69,-70,110,10,17,-116,70,50,4,-107,70,-116,-64,117,26,68,-96,-87,-90,-31,-16,16,92,49,-124,-101,27,25,-124,74,35,-71,-110,-31,-38,108,16,3,-46,85,126,51,27,-106,56,-111,38,19,25,-122,74,35,-63,-48,-24,-99,-96,25,8,-103,-4,-61,66,-78,-99,-89,83,25,-122,74,35,-63,96,-94,38,115,-55,-46,85,-2,72,-78,52,113,28,102,51,25,-122,74,35,-63,96,55,-71,-93,53,-57,52,-8,45,109,-19,-10,-61,-61,-57,-71,-78,43,43,-90,25,-50,-74,48,57,-100,96,-73,41,-95,51,-118,-25,-49,121,93,48,25,-34,-113,6,-82,-83,-62,-3,91,124,116,-37,-123,67,-56,50,12,3,-23,-102,-19,-72,-106,-125,92,56,-25,-64,39,-16,99,-76,-51,54,-37,18,-28,106,-123,87,-60,-117,23,67,-126,-93,-76,27,1,-100,-36,-29,-68,-108,41,-86,-118,-78,18,41,94,-53,71,-75,8,91,46,-80,-50,-21,20,-74,58,-108,-32,-25,-37,-51,-2,-127,93,-82,-93,-16,69,46,20,10,94,-43,-99,127,-74,-84,84,0,31,-40,12,59,41,76,90,127,-58,-77,64,-46,112,83,-106,10,-29,105,-105,57,-73,-49,64,-107,-91,-62,-95,-55,109,-69,110,-94,-101,-24,-40,-92,55,-99,-43,76,28,-29,-40,-30,-22,-54,-81,15,-14,-15,112,28,108,-45,97,-50,82,28,-89,120,-50,122,117,9,-55,-105,120,74,-24,75,56,39,-2,19,-90,43,-112,56,-4,-117,51,47,16,-123,111,126,54,-53,-124,-64,-94,80,-78,-24,-122,36,90,-28,-21,-100,71,2,-60,53,-55,42,-65,-59,-64,-63,-63,98,76,-109,100,89,-74,-58,-112,-5,-84,116,-37,-105,-117,65,94,-65,-58,-117,-68,113,-49,-118,46,-88,-54,70,-53,86,72,-77,39,-82,79,-25,117,19,-46,-73,118,81,-39,-42,21,-125,52,-35,66,21,-99,-105,-60,-12,-83,84,6,121,-23,-122,-93,48,43,36,-112,-54,62,43,-91,-65,-66,-101,-125,-68,-64,111,-60,-49,-5,-1,-50,79,112,52,-33,-73,-5,-8,-105,45,-40,15,-20,91,-71,-79,-18,122,67,118,-78,49,-55,97,-10,6,-55,89,-47,-95,80,-18,-113,39,-106,-25,-121,-3,-81,-123,-3,107,-118,-86,80,50,-71,-34,99,-65,-27,11,123,-113,59,94,112,59,-101,-98,121,65,-5,-84,-43,-71,-13,38,94,-81,-61,-115,-90,25,-68,47,-63,-99,-60,-105,-102,66,-18,75,32,
-13,37,-16,-4,-2,-24,55,93,-71,12,36,-82,92,122,-125,-32,108,-40,-15,120,127,116,-109,31,-94,-35,-110,12,-47,30,120,-83,-50,108,-32,127,110,24,95,6,-53,-9,-90,-3,-65,58,-97,89,-27,-121,-113,-4,-74,-84,81,86,-58,17,101,5,23,-54,33,101,85,-29,20,126,66,89,-63,-33,39,73,43,-3,-41,-92,85,-50,66,125,-98,-76,-54,57,-82,127,69,90,25,123,50,74,117,-10,100,-108,-128,-76,-86,-20,-64,72,64,90,33,70,90,53,-55,89,125,85,-54,7,-23,-4,-3,117,98,-108,-113,-125,-8,119,-27,-87,81,62,22,66,39,78,-7,-24,26,79,124,-98,26,-27,-125,-48,17,113,120,106,-108,-113,-61,111,-28,-109,-93,124,44,62,-117,78,-116,-14,113,-43,-93,26,-9,-28,40,31,75,-27,121,-73,19,-92,124,44,126,51,-97,32,-27,99,-13,-44,-37,9,82,62,38,127,36,-99,32,-27,-29,30,-47,86,95,-97,-14,41,-33,-14,-115,-110,62,-59,-77,-108,39,125,10,-23,79,73,-97,-39,-92,-50,-24,-120,56,-97,-33,-90,-123,12,83,-1,21,45,-92,33,1,115,116,-56,11,25,34,94,-56,-74,63,-59,11,-79,95,43,-72,119,-87,-50,-1,-112,-73,-69,-52,-66,-119,-95,-26,-35,-4,38,-122,-66,-119,-95,-1,27,49,4,-97,31,15,0,-88,84]";
byte[] data = sr.getBytes();
try {
String x = new String(decompress(compress(data)));
System.out.println("decompressed " + x);
} catch (IOException | DataFormatException e) {
e.printStackTrace();
}
}

/**
 * Compresses {@code data} with a default-configured {@link Deflater} and prints
 * the original and compressed sizes to standard output.
 *
 * @param data bytes to compress; must not be {@code null}
 * @return the deflated bytes
 * @throws IOException declared for callers that already handle it
 */
public static byte[] compress(byte[] data) throws IOException {
    Deflater deflater = new Deflater();
    try {
        deflater.setInput(data);
        deflater.finish();

        ByteArrayOutputStream outputStream = new ByteArrayOutputStream(data.length);
        byte[] buffer = new byte[1024];
        while (!deflater.finished()) {
            int count = deflater.deflate(buffer);
            outputStream.write(buffer, 0, count);
        }
        byte[] output = outputStream.toByteArray();

        System.out.println("Original: " + data.length);
        System.out.println("Compressed: " + output.length);
        return output;
    } finally {
        // Release the native zlib state now instead of waiting for finalization.
        deflater.end();
    }
}

/**
 * Inflates {@code data} with a default-configured {@link Inflater} and prints an
 * empty line to standard output once done.
 *
 * @param data deflated bytes; must not be {@code null}
 * @return the inflated bytes
 * @throws IOException declared for callers that already handle it
 * @throws DataFormatException if the data is malformed, or if it is empty,
 *         truncated or requires a preset dictionary (cases in which the
 *         inflater cannot make progress)
 */
public static byte[] decompress(byte[] data) throws IOException, DataFormatException {
    Inflater inflater = new Inflater();
    try {
        inflater.setInput(data);
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream(data.length);
        byte[] buffer = new byte[1024];
        while (!inflater.finished()) {
            int count = inflater.inflate(buffer);
            // All input was supplied up front, so zero bytes without reaching the
            // end of the stream means no further progress is possible. Fail here
            // rather than spin forever.
            if (count == 0 && !inflater.finished()) {
                throw new DataFormatException(inflater.needsDictionary()
                        ? "Compressed data requires a preset dictionary"
                        : "Compressed data is empty or truncated");
            }
            outputStream.write(buffer, 0, count);
        }
        byte[] output = outputStream.toByteArray();
        System.out.println();
        return output;
    } finally {
        // Release the native zlib state now instead of waiting for finalization.
        inflater.end();
    }
}
}


A sample of what the data looks like:

"120,-100,-19,89,91,79,-21,56,16,-2,43,40,-49,104,55,113,-18,-68,-91,-23,21,104,90,-38,-62,10,-83,120,48,-83,91,34,-46,-92,-21,-92,8,-124,-8,-17,103,-100,-92,77,-22,-38,-25,112,86,27,-119,106,65,84,-86,103,-58,-10,124,-98,-15,101,-66,-66,43,25,126,-99,-112,116,-109,-60,41,81,46,-34,-107,-57,36,121,14,-29,-43,-20,109,3,77,101,-94,-100,43,120,-79,-115,50,63,-39,-58,25,8,52,16,-52,-97,-62,104,-79,19,-88,32,8,-29,37,-114,-77,-70,-92,71,-78,89,125,-36,-65,-33,-107,5,-50,-40,-120,-86,-11,39,82,-31,95,-77,-40,-48,-21,-94,15,82,13,11,-58,-115,112,-102,-6,-55,-126,-103,-7,126,-65,53,-22,-113,-64,-58,123,-63,97,52,37,-85,53,97,-106,-17,74,55,10,87,79,-39,96,-63,-100,65,76,31,-46,40,-116,73,-39,111,-38,3,81,97,18,108,-41,-113,-124,-126,-52,-48,-100,-62,-50,-89,120,-103,-107,-56,108,-99,9,71,52,92,-123,49,52,91,-41,-109,125,19,44,55,9,-51,102,-124,-82,-61,24,71,-96,5,85,-101,-92,25,-76,-78,48,-55,-51,71,-61,67,-103,-92,-49,6,-45,108,75,73,27,-80,-49,-62,53,-101,23,-64,-25,75,-96,89,103,72,-67,48,-44,11,-107,-83,-105,71,105,-8,-126,35,-119,29,-70,-48,74,-69,-10,-106,-18,92,-48,-98,104,122,-90,-85,48,93,10,-118,2,108,-78,-100,102,-55,38,85,46,-44,115,-27,46,-60,-123,23,-2,-106,82,18,-49,-33,-54,21,26,4,-109,-35,-86,-114,-15,107,23,-125,119,36,-125,70,-102,71,-55,23,-58,96,47,5,-60,-13,-61,-24,-80,-28,96,97,105,-31,52,-100,123,101,60,53,-61,112,33,12,48,54,19,-61,-56,74,-112,116,41,-127,-42,74,41,-28,-69,-4,34,-53,109,-68,-64,-113,17,1,-7,-3,77,-18,-8,44,-55,112,4,-39,-77,27,12,68,61,-102,-92,-23,126,112,-45,48,-64,-91,100,-67,14,-45,-76,88,11,-45,-2,-61,-75,-108,-113,-113,-13,67,0,-99,-114,12,64,-91,17,3,-128,-124,108,14,0,-46,28,-99,3,-32,104,66,0,-35,-82,12,64,-91,-111,0,48,-35,6,1,-40,-74,-54,1,-48,84,93,-120,-96,-33,-105,33,-88,52,98,4,-70,-34,96,8,116,-45,114,121,4,-70,24,-63,-27,-91,12,65,-91,-111,32,112,27,-116,-127,-127,44,-115,71,96,-70,66,4,87,87,50,4,-107,70,-116,-64,112,26,-116,-127,-87,89,54,-113,-64,21,-57,-32,-6,90,-122,-96,-46,-120,17,-104,77,34,-80,-112,-50,111,1
00,36,-55,-94,-31,80,-122,-96,-46,-120,17,-40,77,30,69,-74,-87,-15,89,-124,36,103,81,16,-56,16,84,26,49,2,71,111,112,31,56,-82,-55,-97,69,-70,110,10,17,-116,70,50,4,-107,70,-116,-64,117,26,68,-96,-87,-90,-31,-16,16,92,49,-124,-101,27,25,-124,74,35,-71,-110,-31,-38,108,16,3,-46,85,126,51,27,-106,56,-111,38,19,25,-122,74,35,-63,-48,-24,-99,-96,25,8,-103,-4,-61,66,-78,-99,-89,83,25,-122,74,35,-63,96,-94,38,115,-55,-46,85,-2,72,-78,52,113,28,102,51,25,-122,74,35,-63,96,55,-71,-93,53,-57,52,-8,45,109,-19,-10,-61,-61,-57,-71,-78,43,43,-90,25,-50,-74,48,57,-100,96,-73,41,-95,51,-118,-25,-49,121,93,48,25,-34,-113,6,-82,-83,-62,-3,91,124,116,-37,-123,67,-56,50,12,3,-23,-102,-19,-72,-106,-125,92,56,-25,-64,39,-16,99,-76,-51,54,-37,18,-28,106,-123,87,-60,-117,23,67,-126,-93,-76,27,1,-100,-36,-29,-68,-108,41,-86,-118,-78,18,41,94,-53,71,-75,8,91,46,-80,-50,-21,20,-74,58,-108,-32,-25,-37,-51,-2,-127,93,-82,-93,-16,69,46,20,10,94,-43,-99,127,-74,-84,84,0,31,-40,12,59,41,76,90,127,-58,-77,64,-46,112,83,-106,10,-29,105,-105,57,-73,-49,64,-107,-91,-62,-95,-55,109,-69,110,-94,-101,-24,-40,-92,55,-99,-43,76,28,-29,-40,-30,-22,-54,-81,15,-14,-15,112,28,108,-45,97,-50,82,28,-89,120,-50,122,117,9,-55,-105,120,74,-24,75,56,39,-2,19,-90,43,-112,56,-4,-117,51,47,16,-123,111,126,54,-53,-124,-64,-94,80,-78,-24,-122,36,90,-28,-21,-100,71,2,-60,53,-55,42,-65,-59,-64,-63,-63,98,76,-109,100,89,-74,-58,-112,-5,-84,116,-37,-105,-117,65,94,-65,-58,-117,-68,113,-49,-118,46,-88,-54,70,-53,86,72,-77,39,-82,79,-25,117,19,-46,-73,118,81,-39,-42,21,-125,52,-35,66,21,-99,-105,-60,-12,-83,84,6,121,-23,-122,-93,48,43,36,-112,-54,62,43,-91,-65,-66,-101,-125,-68,-64,111,-60,-49,-5,-1,-50,79,112,52,-33,-73,-5,-8,-105,45,-40,15,-20,91,-71,-79,-18,122,67,118,-78,49,-55,97,-10,6,-55,89,-47,-95,80,-18,-113,39,-106,-25,-121,-3,-81,-123,-3,107,-118,-86,80,50,-71,-34,99,-65,-27,11,123,-113,59,94,112,59,-101,-98,121,65,-5,-84,-43,-71,-13,38,94,-81,-61,-115,-90,25,-68,47,-63,-99,-60,-105,-102,66,-18,75,32,-13,37,-16,-4,
-2,-24,55,93,-71,12,36,-82,92,122,-125,-32,108,-40,-15,120,127,116,-109,31,-94,-35,-110,12,-47,30,120,-83,-50,108,-32,127,110,24,95,6,-53,-9,-90,-3,-65,58,-97,89,-27,-121,-113,-4,-74,-84,81,86,-58,17,101,5,23,-54,33,101,85,-29,20,126,66,89,-63,-33,39,73,43,-3,-41,-92,85,-50,66,125,-98,-76,-54,57,-82,127,69,90,25,123,50,74,117,-10,100,-108,-128,-76,-86,-20,-64,72,64,90,33,70,90,53,-55,89,125,85,-54,7,-23,-4,-3,117,98,-108,-113,-125,-8,119,-27,-87,81,62,22,66,39,78,-7,-24,26,79,124,-98,26,-27,-125,-48,17,113,120,106,-108,-113,-61,111,-28,-109,-93,124,44,62,-117,78,-116,-14,113,-43,-93,26,-9,-28,40,31,75,-27,121,-73,19,-92,124,44,126,51,-97,32,-27,99,-13,-44,-37,9,82,62,38,127,36,-99,32,-27,-29,30,-47,86,95,-97,-14,41,-33,-14,-115,-110,62,-59,-77,-108,39,125,10,-23,79,73,-97,-39,-92,-50,-24,-120,56,-97,-33,-90,-123,12,83,-1,21,45,-92,33,1,115,116,-56,11,25,34,94,-56,-74,63,-59,11,-79,95,43,-72,119,-87,-50,-1,-112,-73,-69,-52,-66,-119,-95,-26,-35,-4,38,-122,-66,-119,-95,-1,27,49,4,-97,31,15,0,-88,84"


Is there a better option for reducing the number of characters in a string without ending up with a byte array or unwanted characters?

Thanks in advance,

Answer

You can compress to a byte[] and then encode the result in Base64. The output uses only alphanumeric characters and a few symbols (`+`, `/` and `=`), which are safe for transferring as text; Base64 is widely used for exactly this purpose.

/**
 * Demo driver: builds a test string of at least 751396 characters, compresses it
 * to Base64 text, expands it again and prints the sizes and the round-trip result.
 */
public static void main(String[] args) {
    final int targetLength = 751396;

    StringBuilder builder = new StringBuilder();
    while (builder.length() < targetLength) {
        builder.append("Size: ");
        // Length is read after the prefix has been appended, as in a chained call.
        builder.append(builder.length());
        builder.append("\n");
    }
    String original = builder.toString();

    String encoded = deflateBase64(original);
    System.out.println("Uncompressed size = " + original.length() + ", compressed size=" + encoded.length());

    String decoded = inflateBase64(encoded);
    boolean identical = decoded.equals(original);
    System.out.println("Same after inflating is " + identical);
}

/**
 * Compresses {@code text} with DEFLATE and returns the result as Base64 text.
 *
 * <p>The text is encoded as UTF-8 before compression, so the output does not
 * depend on the platform default charset and can be expanded on any machine
 * with {@link #inflateBase64(String)}.
 *
 * @param text text to compress; must not be {@code null}
 * @return Base64 encoding of the deflated UTF-8 bytes
 * @throws AssertionError if the in-memory streams report an {@link IOException}
 */
public static String deflateBase64(String text) {
    try {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Closing the writer finishes the deflater stream before the bytes are read.
        try (Writer writer = new OutputStreamWriter(
                new DeflaterOutputStream(baos), java.nio.charset.StandardCharsets.UTF_8)) {
            writer.write(text);
        }
        return Base64.getEncoder().encodeToString(baos.toByteArray());
    } catch (IOException e) {
        throw new AssertionError(e);
    }
}

/**
 * Reverses {@link #deflateBase64(String)}: decodes the Base64 text, inflates the
 * bytes and decodes them as UTF-8.
 *
 * @param base64 Base64 text produced by {@link #deflateBase64(String)}
 * @return the original text
 * @throws IllegalArgumentException if {@code base64} is not valid Base64
 * @throws AssertionError if inflating fails with an {@link IOException}
 */
public static String inflateBase64(String base64) {
    try (Reader reader = new InputStreamReader(
            new InflaterInputStream(
                    new ByteArrayInputStream(
                            Base64.getDecoder().decode(base64))),
            java.nio.charset.StandardCharsets.UTF_8)) {
        StringWriter sw = new StringWriter();
        char[] chars = new char[1024];
        int len;
        while ((len = reader.read(chars)) != -1) {
            sw.write(chars, 0, len);
        }
        return sw.toString();
    } catch (IOException e) {
        throw new AssertionError(e);
    }
}

prints

Uncompressed size = 751400, compressed size=219564
Same after inflating is true