JJKx JJKx - 4 months ago 58
Java Question

combine two methods in Java together in this Java code

I want combine the two methods in my document parser, frequencyCounter and parseFiles. , one will read 5 textfiles of html source document and then count the keyword frequencies of 5 keywords.
I
found out that my output of keywords frequency counter is incorrect. I have two methods, parseFiles which loop over each file in the directory and reads some information from it. frequencyCounter is supposed to print information from each while as it was parsed to get token count and frequency information.

I want all of frequencyCounter should be a function that should be executed from within parseFiles, and relevant information about the file's content should be passed to doSomething so that it knows what to print.

Right now I'm just keep messing up on how to put these two methods together, please give some advices.

Correct format out of output should be something like this:

Doc1 Doc2 Doc3 Doc4 Doc5

Doc1 1 0.78
Doc2 0.57
Doc3
Doc4
Doc5


Engineering Research Data Minining Professor

Doc1 23 644
Doc2 457
Doc3
Doc4
Doc5


this is my main class:

import java.io.FileNotFoundException;
import java.io.IOException;
public class TfIdfMain {

private static String[] files = {"doc1.txt", "doc2.txt", "doc3.txt" , "doc4.txt", "doc5.txt"};
public static void main(String args[]) throws FileNotFoundException, IOException {
DocumentParser dp = new DocumentParser();
dp.parseFiles("C:\\Users\\dachen\\Documents");
dp.tfIdfCalculator();
dp.frequencyCounter();
dp.getCosineMatrix();
}
}


This is my parser class:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Arrays;

public class DocumentParser {

public void frequencyCounter(){
String text = "Professor, engineering, data, mining, research";
StringTokenizer str = new StringTokenizer(text);
String word[] = new String[10];
String unique[] = new String[10];
String x;
int count = -1;
while (str.hasMoreTokens()) {
count++;
x = str.nextToken();
word[count] = x;
System.out.println(count + ": " + word[count]);
}

System.out.println("---Keywords Frequency---");
for (int i = 0; i < 7; i++) {

if ((!Arrays.asList(unique).contains(word[i]))) {
unique[i] = word[i];
}
}

int[] measure = new int[10];

for (int a = 0; a < 7; a++) {
if (Arrays.asList(unique).contains(word[a])) {
measure[a] += 1;
System.out.println(unique[a] + " : " + measure[a]);
}
}
}

private List<String[]> termsDocsArray = new ArrayList<String[]>();
private List<String> allTerms = new ArrayList<String>();
private List<double[]> tfidfDocsVector = new ArrayList<double[]>();

public void parseFiles(String filePath) throws FileNotFoundException, IOException {
File[] allfiles = new File(filePath).listFiles();
BufferedReader in = null;
for (File f : allfiles) {
if (f.getName().endsWith(".txt")) {
in = new BufferedReader(new FileReader(f));
StringBuilder sb = new StringBuilder();
String s = null;
while ((s = in.readLine()) != null) {
sb.append(s);
}
String[] tokenizedTerms = sb.toString().replaceAll("[\\W&&[^\\s]]", "").split("\\W+"); //to get individual terms
for (String term : tokenizedTerms) {
if (!allTerms.contains(term)) {
allTerms.add(term);
}
}
termsDocsArray.add(tokenizedTerms);
}
}

}

public void tfIdfCalculator() {
double tf;
double idf;
double tfidf;
for (String[] docTermsArray : termsDocsArray) {
double[] tfidfvectors = new double[allTerms.size()];
int count = 0;
for (String terms : allTerms) {
tf = new TfIdf().getTf(docTermsArray, terms);
idf = new TfIdf().idfCalculation(termsDocsArray, terms);
tfidf = tf * idf;
tfidfvectors[count] = tfidf;
count++;
}
tfidfDocsVector.add(tfidfvectors);
}
}
public void getCosineMatrix() {
for (int i = 0; i < tfidfDocsVector.size(); i++) {
for (int j = 0; j < tfidfDocsVector.size(); j++) {
System.out.println("between " + i + " and " + j + " = "
+ new CosineSimilarity().getCosine
(
tfidfDocsVector.get(i),
tfidfDocsVector.get(j)
)
);
}
}
}
}

Answer

The code below gives you this output:

Professor frequency: 54
engineering frequency: 188
data frequency: 2
mining frequency: 2
research frequency: 9

Though this is only for doc1, you've to add a loop to iterate on all the 5 documents.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Scanner;

public class yolo {
    public static void frodo() throws Exception {

        String[] keywords = { "Professor" , "engineering" , "data" , "mining" , "research"};
        for(int i=0; i< keywords.length; i++){

        String testWord = keywords[i];
        File document = new File("path//to//doc1.txt");
        boolean check = true;

        try {
            FileInputStream fstream = new FileInputStream(document);
            BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
            String strLine;
            strLine = br.readLine();

            // Read File Line By Line

            int count = 0;
            while ((strLine = br.readLine()) != null) {

                // check to see whether testWord occurs at least once in the
                // line of text
                check = strLine.toLowerCase().contains(testWord.toLowerCase());

                if (check) {
                    // get the line
                    String[] lineWords = strLine.split("\\s+");
                    count++;
                }

            }
            System.out.println(testWord + "frequency: " + count);

            br.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

}

hope this helps!

Comments