Ram Ki Ram Ki - 3 months ago 15
Java Question

How to read duplicate words count from a directory or a folder

I got the program below from a coding site.

The following code reads a text file and finds duplicate words.

I want to read each text file in turn and display its duplicate-word counts line by line.
Also, how do I call those files if they are not stored as a String? I used a BufferedReader, but I am not getting my output.

My questions:


  1. How can I make the program read multiple files from a given folder?

  2. How to save the results in Excel file format?



Any suggestions are welcome.

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;


/**
 * Counts how often each word occurs in a text file and lists the words from the most
 * frequent to the least frequent.
 */
public class MaxDuplicateWordCount {

    /** File analysed by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_FILE = "E:\\Blog 39.txt";

    /**
     * Reads a text file and counts the occurrences of every whitespace-separated word.
     *
     * <p>The file is decoded as UTF-8 and words are lower-cased before counting, so
     * {@code "Word"} and {@code "word"} share one entry. Reading is best-effort: if the file
     * is missing or cannot be read, the problem is reported on {@code System.err} and the
     * counts gathered so far (possibly none) are returned.
     *
     * @param fileName path of the file to read
     * @return a mutable map from lower-cased word to its number of occurrences; never null
     */
    public Map<String, Integer> getWordCount(String fileName) {
        Map<String, Integer> wordMap = new HashMap<String, Integer>();

        // try-with-resources closes the reader and the stream on every exit path.
        try (FileInputStream fis = new FileInputStream(fileName);
             BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"))) {
            boolean firstLine = true;
            String line;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Drop a leading byte-order mark so it does not stick to the first word.
                    if (line.startsWith("\uFEFF")) {
                        line = line.substring(1);
                    }
                }
                // Default delimiters split on any ASCII whitespace (space, tab, ...),
                // not only on the space character.
                StringTokenizer st = new StringTokenizer(line);
                while (st.hasMoreTokens()) {
                    // Locale.ROOT keeps the folding independent of the machine's locale.
                    String word = st.nextToken().toLowerCase(java.util.Locale.ROOT);
                    Integer count = wordMap.get(word);
                    wordMap.put(word, count == null ? 1 : count + 1);
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("File not found: " + fileName + " (" + e.getMessage() + ")");
        } catch (IOException e) {
            System.err.println("Could not read " + fileName + ": " + e);
        }
        return wordMap;
    }

    /**
     * Returns the entries of a word-count map ordered by descending count.
     *
     * <p>Words with the same count are ordered alphabetically so that the result does not
     * depend on the iteration order of the map.
     *
     * @param wordMap word counts, e.g. as produced by {@link #getWordCount(String)}
     * @return a new list holding the map's entries, most frequent word first
     */
    public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap) {
        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(wordMap.entrySet());
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        return list;
    }

    /**
     * Prints the word counts of one file, most frequent word first.
     *
     * @param a optional command-line arguments; {@code a[0]} is the file to analyse,
     *          the built-in default path is used when it is absent
     */
    public static void main(String a[]) {
        String fileName = (a != null && a.length > 0) ? a[0] : DEFAULT_FILE;

        MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
        Map<String, Integer> wordMap = mdc.getWordCount(fileName);

        List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
        for (Map.Entry<String, Integer> entry : list) {
            System.out.println(entry.getKey() + " =" + entry.getValue());
        }
    }
}

Answer

Intro

After chatting with OP, here is briefly what OP requires:

1- Read the file(s) from a specific folder; the files are typically Unicode text files.
2- The files will be processed by the OP's algorithm from the question, and the results should be saved to a Unicode file again (later the OP asked for them to be saved as an Excel file (.XLS) because of Excel's Unicode compatibility).

Solution

This can be solved in following steps:

step 1 We define (declare) our work-space.
step 2 We create an output folder in the work-space if it does not exist.
step 3 We read all existing files in the work-space folder and process them with the algorithm.
step 4 The results for each file will be saved as an Excel file in the output folder.

The code

First of all, you need to import the POI package, which allows you to create XLS sheets. I downloaded poi/poi-3.5-FINAL.jar.zip (1,372 k); the following imports should be added to your code.

import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;

Next, add the following code to your class (it also needs java.io.File and java.io.FileOutputStream to be imported); the code is self-explanatory:

/**
 * Folder whose files are analysed; the output folder is created inside it.
 * Must end with a path separator because file and folder names are appended
 * to it by plain string concatenation.
 */
final static String WORKSPACE = "C:/testfolder/";

/**
 * Creates the output folder inside {@code WORKSPACE} unless it already exists.
 *
 * <p>Best-effort: a failure is reported on {@code System.err} instead of being thrown,
 * so the caller carries on and reports its own errors when it tries to write there.
 *
 * @param outputFolderName folder name relative to {@code WORKSPACE}
 */
private static void createOutputFolder(String outputFolderName) {
    File outputDirectory = new File(WORKSPACE + outputFolderName);

    if (outputDirectory.isDirectory()) {
        return;
    }
    try {
        // mkdir() signals failure through its return value, not through an exception.
        if (!outputDirectory.mkdir()) {
            System.err.println("Could not create output folder " + outputDirectory.getPath());
        }
    } catch (SecurityException e) {
        System.err.println("Not allowed to create output folder "
                + outputDirectory.getPath() + ": " + e);
    }
}

/**
 * Processes every regular file directly inside {@code WORKSPACE}: counts its words, sorts
 * them by frequency and writes the result to {@code output/<name>output.xls}.
 *
 * <p>Sub-folders (including the output folder itself) are skipped. A failure on one file
 * is reported on {@code System.err} and does not stop the remaining files.
 */
private static void exlCreator() {
    String outputFolder = "output/";
    createOutputFolder(outputFolder);

    File[] listOfFiles = new File(WORKSPACE).listFiles();
    if (listOfFiles == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        System.err.println("Cannot list files in " + WORKSPACE);
        return;
    }

    MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
    for (File inputFile : listOfFiles) {
        if (!inputFile.isFile()) {
            continue;
        }
        String fileName = inputFile.getName();
        Map<String, Integer> wordMap = mdc.getWordCount(WORKSPACE + fileName);
        List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
        String fileNameWPathOutput = WORKSPACE + outputFolder + outputNameFor(fileName);
        try {
            writeReport(list, fileNameWPathOutput);
            System.out.println(fileNameWPathOutput + " is created");
        } catch (IOException | RuntimeException ex) {
            // RuntimeException covers failures raised by the spreadsheet library itself.
            System.err.println("Could not create " + fileNameWPathOutput + ": " + ex);
        }
    }
}

/** Derives the report file name by replacing the input file's extension with "output.xls". */
private static String outputNameFor(String fileName) {
    int dot = fileName.lastIndexOf('.');
    // dot <= 0 means "no extension" or a dot-file such as ".profile": keep the whole name.
    String baseName = dot > 0 ? fileName.substring(0, dot) : fileName;
    return baseName + "output.xls";
}

/** Writes a header row plus one numbered row per word/count entry to a new .xls file. */
private static void writeReport(List<Entry<String, Integer>> rows, String outputPath)
        throws IOException {
    HSSFWorkbook workbook = new HSSFWorkbook();
    HSSFSheet sheet = workbook.createSheet("ResultSheet");

    HSSFRow rowhead = sheet.createRow(0);
    rowhead.createCell(0).setCellValue("Serial No.");
    rowhead.createCell(1).setCellValue("Word");
    rowhead.createCell(2).setCellValue("Count");

    int serialNumber = 1;
    for (Map.Entry<String, Integer> entry : rows) {
        // Row index stays an int: a cast to short would wrap around after 32767 rows.
        HSSFRow row = sheet.createRow(serialNumber);
        row.createCell(0).setCellValue(serialNumber);
        row.createCell(1).setCellValue(entry.getKey());
        row.createCell(2).setCellValue(entry.getValue());
        serialNumber++;
    }

    // try-with-resources closes the stream even when write() fails.
    try (FileOutputStream fileOut = new FileOutputStream(outputPath)) {
        workbook.write(fileOut);
    }
}

/**
 * Program entry point: runs {@code exlCreator()}; the command-line arguments are not used.
 *
 * @param args command-line arguments (ignored)
 * @throws IOException declared by the signature; the single call in the body does not
 *                     declare it
 */
public static void main(String [] args) throws IOException {
    exlCreator();
}

Finally

By adapting the code, it is possible to create a single output file and write each file's results to a separate worksheet. As you can see in the image below, the output file opens in Excel and shows the Unicode text without problems, which was the issue in my first solution: enter image description here

Links

Download POI
POI documentation
Unicode problem in CSV
More about CSV

Full code, requested from OP

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
// for the Excel sheet (Apache POI)
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;

/**
 * Counts the words of every file in a workspace folder and writes one Excel (.xls) report
 * per input file, listing the words from the most frequent to the least frequent.
 */
public class MaxDuplicateWordCount {

    /**
     * Folder whose files are analysed; the output folder is created inside it.
     * Must end with a path separator because names are appended by plain concatenation.
     */
    final static String WORKSPACE = "C:/testfolder/";

    /**
     * Reads a text file and counts the occurrences of every whitespace-separated word.
     *
     * <p>The file is decoded as UTF-8 and words are lower-cased before counting. Reading is
     * best-effort: if the file is missing or cannot be read, the problem is reported on
     * {@code System.err} and the counts gathered so far (possibly none) are returned.
     *
     * @param fileName path of the file to read
     * @return a mutable map from lower-cased word to its number of occurrences; never null
     */
    public Map<String, Integer> getWordCount(String fileName) {
        Map<String, Integer> wordMap = new HashMap<String, Integer>();

        // try-with-resources closes the reader and the stream on every exit path.
        try (FileInputStream fis = new FileInputStream(fileName);
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(fis, StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Drop a leading byte-order mark so it does not stick to the first word.
                    if (line.startsWith("\uFEFF")) {
                        line = line.substring(1);
                    }
                }
                // Default delimiters split on any ASCII whitespace (space, tab, ...),
                // not only on the space character.
                StringTokenizer st = new StringTokenizer(line);
                while (st.hasMoreTokens()) {
                    // Locale.ROOT keeps the folding independent of the machine's locale.
                    String word = st.nextToken().toLowerCase(java.util.Locale.ROOT);
                    Integer count = wordMap.get(word);
                    wordMap.put(word, count == null ? 1 : count + 1);
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("File not found: " + fileName + " (" + e.getMessage() + ")");
        } catch (IOException e) {
            System.err.println("Could not read " + fileName + ": " + e);
        }
        return wordMap;
    }

    /**
     * Returns the entries of a word-count map ordered by descending count.
     *
     * <p>Words with the same count are ordered alphabetically so that the result does not
     * depend on the iteration order of the map.
     *
     * @param wordMap word counts, e.g. as produced by {@link #getWordCount(String)}
     * @return a new list holding the map's entries, most frequent word first
     */
    public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap) {
        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(wordMap.entrySet());
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        return list;
    }

    /**
     * Creates the output folder inside {@link #WORKSPACE} unless it already exists.
     *
     * <p>Best-effort: a failure is reported on {@code System.err} instead of being thrown.
     *
     * @param outputFolderName folder name relative to {@link #WORKSPACE}
     */
    private static void createOutputFolder(String outputFolderName) {
        File outputDirectory = new File(WORKSPACE + outputFolderName);

        if (outputDirectory.isDirectory()) {
            return;
        }
        try {
            // mkdir() signals failure through its return value, not through an exception.
            if (!outputDirectory.mkdir()) {
                System.err.println(
                        "Could not create output folder " + outputDirectory.getPath());
            }
        } catch (SecurityException e) {
            System.err.println("Not allowed to create output folder "
                    + outputDirectory.getPath() + ": " + e);
        }
    }

    /** Derives the report file name by replacing the input file's extension with "output.xls". */
    private static String outputNameFor(String fileName) {
        int dot = fileName.lastIndexOf('.');
        // dot <= 0 means "no extension" or a dot-file such as ".profile": keep the whole name.
        String baseName = dot > 0 ? fileName.substring(0, dot) : fileName;
        return baseName + "output.xls";
    }

    /** Writes a header row plus one numbered row per word/count entry to a new .xls file. */
    private static void writeReport(List<Entry<String, Integer>> rows, String outputPath)
            throws IOException {
        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet("ResultSheet");

        HSSFRow rowhead = sheet.createRow(0);
        rowhead.createCell(0).setCellValue("Serial No.");
        rowhead.createCell(1).setCellValue("Word");
        rowhead.createCell(2).setCellValue("Count");

        int serialNumber = 1;
        for (Map.Entry<String, Integer> entry : rows) {
            // Row index stays an int: a cast to short would wrap around after 32767 rows.
            HSSFRow row = sheet.createRow(serialNumber);
            row.createCell(0).setCellValue(serialNumber);
            row.createCell(1).setCellValue(entry.getKey());
            row.createCell(2).setCellValue(entry.getValue());
            serialNumber++;
        }

        // try-with-resources closes the stream even when write() fails.
        try (FileOutputStream fileOut = new FileOutputStream(outputPath)) {
            workbook.write(fileOut);
        }
    }

    /**
     * Processes every regular file directly inside {@link #WORKSPACE}: counts its words,
     * sorts them by frequency and writes the result to {@code output/<name>output.xls}.
     *
     * <p>Sub-folders (including the output folder itself) are skipped. A failure on one
     * file is reported on {@code System.err} and does not stop the remaining files.
     */
    private static void exlCreator() {
        String outputFolder = "output/";
        createOutputFolder(outputFolder);

        File[] listOfFiles = new File(WORKSPACE).listFiles();
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("Cannot list files in " + WORKSPACE);
            return;
        }

        MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
        for (File inputFile : listOfFiles) {
            if (!inputFile.isFile()) {
                continue;
            }
            String fileName = inputFile.getName();
            Map<String, Integer> wordMap = mdc.getWordCount(WORKSPACE + fileName);
            List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
            String fileNameWPathOutput = WORKSPACE + outputFolder + outputNameFor(fileName);
            try {
                writeReport(list, fileNameWPathOutput);
                System.out.println(fileNameWPathOutput + " is created");
            } catch (IOException | RuntimeException ex) {
                // RuntimeException covers failures raised by the spreadsheet library itself.
                System.err.println("Could not create " + fileNameWPathOutput + ": " + ex);
            }
        }
    }

    /**
     * Program entry point: converts every file in {@link #WORKSPACE} to an Excel report.
     *
     * @param args command-line arguments (ignored)
     * @throws IOException kept in the signature for compatibility; per-file I/O errors are
     *                     handled and reported inside {@code exlCreator()}
     */
    public static void main(String[] args) throws IOException {
        exlCreator();
    }
}
Comments