
Calculating TF-IDF scores with Lucene

This is a program that computes the TF-IDF value of each document in a collection of documents. It works, but computing the "IDF" values takes a long time, because I don't have a fast way to find the number of documents that contain a given term.

Is there an efficient way to find out how many documents contain a particular term?

freq = termsFreq.getTermFrequencies();
terms = termsFreq.getTerms();

int noOfTerms = terms.length;
score = new float[noOfTerms];
DefaultSimilarity simi = new DefaultSimilarity();

for (i = 0; i < noOfTerms; i++) {
    int noofDocsContainTerm = noOfDocsContainTerm(terms[i]);
    float tf = simi.tf(freq[i]);
    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
    score[i] = tf * idf;
}

////

// This per-term search is the slow part: it parses a query and collects
// every matching document just to count the hits.
public int noOfDocsContainTerm(String querystr) throws CorruptIndexException, IOException, ParseException {
    QueryParser qp = new QueryParser(Version.LUCENE_35, "docuemnt", new StandardAnalyzer(Version.LUCENE_35));
    Query q = qp.parse(querystr);

    int hitsPerPage = docNames.length; // minimum number of search results
    IndexSearcher searcher = new IndexSearcher(ramMemDir, true);
    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);

    searcher.search(q, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    return hits.length;
}
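
For reference, both answers below replace this per-term search with a direct document-frequency lookup on the index. A minimal sketch, assuming an IndexReader named reader is open on the same index and the field name is whatever the content was indexed under:

// Reads the document frequency straight from the index -- no query
// parsing or hit collection required.
int noofDocsContainTerm = reader.docFreq(new Term("doccontent", terms[i]));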

Hi, I'm trying to implement a program that calculates TF-IDF values for the documents in a collection, and I've been stuck on it for days. If you don't mind, could you share how you did it? Thank you. – fuschia


@fuschia I've posted my answer. – Kasun

Answers

3



import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory; // needed for the ramMemDir field below
import org.apache.lucene.util.Version;


/*
 * April 14, 2012 - Kasun Perera - Created
 */

/*
 * This class contains methods for indexing documents with Lucene and
 * calculating TF-IDF weights.
 */
public class DocIndexer { 

private String docNames[]; 
private String docIDS[]; 
private String pathToIndex; 
private String pathToDocumentCollection; 
private String fiboTermList[]; //marked up fibo terms 
private String taxoTermList[]; // marked up taxonomy terms 
private RAMDirectory ramMemDir; 
private String fileNames[]; 
private byte files[][]; 
private String filesInText[]; 
int noOfWordsOfDOc[]; 
int noOfSentencesOfDoc[]; 
ArrayList<String> ArrLstSentencesOfDoc[]; 
String removedTermsOfDOc[][]; 
int freqAfterRemovalOfDoc[][]; 
//int queryDocIndex ; 
private int curDocNo; 
private final int maxTerms = 1000000; 




/**
 * Constructor used when the indexing directory is a RAM directory. We
 * need a RAM directory because the Stratoes server doesn't allow
 * access to local files.
 *
 * @param pathToIndex - path to the document index
 * @param pathToDocumentCollection - path to the document collection
 */
public DocIndexer(String pathToIndex, String pathToDocumentCollection) {
    this.pathToIndex = pathToIndex;
    this.pathToDocumentCollection = pathToDocumentCollection;
}




/** 
* Count the number of words in a given String 
* 
* @param line- Input String 
* @return - number of words in the input String 
*/ 
private int wordCount(String line) { 
    int numWords = 0; 
    int index = 0; 
    boolean prevWhiteSpace = true; 
    while (index < line.length()) { 
     char c = line.charAt(index++); 
     boolean currWhiteSpace = Character.isWhitespace(c); 
     if (prevWhiteSpace && !currWhiteSpace) { 
      numWords++; 
     } 
     prevWhiteSpace = currWhiteSpace; 
    } 
    return numWords; 
} 

/*
 * Given its path, reads a text file and returns the contents as a String.
 */
public static String fileReader(String filename) throws IOException {
    // Start from an empty string; starting from null would prepend the
    // literal text "null" to the result.
    StringBuilder filetext = new StringBuilder();
    BufferedReader reader = new BufferedReader(new FileReader(new File(filename)));
    String line;

    while ((line = reader.readLine()) != null) {
        filetext.append(" ").append(line);
    }

    reader.close();
    return filetext.toString();
}

/**
 * Indexes the documents using only the content of each document. A
 * "docid" field is stored as well, because Lucene doesn't retrieve
 * documents in the order they were indexed.
 *
 * @throws IOException
 */
public void indexDocs() throws IOException { 
    File folder = new File(pathToDocumentCollection); 
    File[] listOfFiles = folder.listFiles(); 
    int noOfFiles = listOfFiles.length; 
    System.out.println("Number of files : " + noOfFiles); 

    IndexWriter iW; 
    int indexDocCount = 0; 
    try { 
     NIOFSDirectory dir = new NIOFSDirectory(new File(pathToIndex)); 
     iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36))); 

     for (int i = 0; i < noOfFiles; i++) { 
      if (listOfFiles[i].isFile()) {
       String docName = listOfFiles[i].getName();
       System.out.println("doc name: " + docName + " length - " + listOfFiles[i].length());
       if (listOfFiles[i].length() > 1) {
        String filesInText = fileReader(pathToDocumentCollection + docName);
        System.out.println("Added to index : " + docName);

        // Strip numbers (integers and decimals) from the content before indexing
        StringReader strRdElt = new StringReader(filesInText.replaceAll("\\d+(?:[.,]\\d+)*\\s*", ""));
        // The file name minus its extension serves as a unique doc id
        StringReader docId = new StringReader(docName.substring(0, docName.length() - 4));

        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
        doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
        doc.add(new Field("docid", docId, Field.TermVector.YES));
        iW.addDocument(doc);
        indexDocCount++;
       }
      }
     } 

     System.out.println("no of documents added to index : " + indexDocCount); 

     iW.close(); 
     // dir.close() ; 
    } catch (CorruptIndexException e) { 
     e.printStackTrace(); 
    } catch (IOException e) { 
     e.printStackTrace(); 
    } 
} 



/**
 * Calculates the TF-IDF score for each term of each indexed document.
 *
 * @param numberOfDocs - number of documents to score
 * @return - HashMap keyed by doc id; each value maps a term to its TF-IDF score
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();

    try {
     IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);
     // Take the collection size from the index itself; the docNames
     // field is never populated by this class.
     int noOfDocs = re.numDocs();

     for (int k = 0; k < numberOfDocs; k++) {
      HashMap<String, Float> wordMap = new HashMap<String, Float>();

      TermFreqVector termsFreq = re.getTermFreqVector(k, "doccontent");
      TermFreqVector termsFreqDocId = re.getTermFreqVector(k, "docid");

      int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
      int freq[] = termsFreq.getTermFrequencies();
      String terms[] = termsFreq.getTerms();

      int noOfTerms = terms.length;
      DefaultSimilarity simi = new DefaultSimilarity();
      for (int i = 0; i < noOfTerms; i++) {
       // docFreq() reads the document frequency straight from the
       // index, so no per-term search is needed.
       int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
       float tf = simi.tf(freq[i]);
       float idf = simi.idf(noofDocsContainTerm, noOfDocs);
       wordMap.put(terms[i], (tf * idf));
      }
      scoreMap.put(aInt, wordMap);
     }
     re.close();
    } catch (IOException e) {
     e.printStackTrace();
    }

    return scoreMap;
}


public HashMap<Integer, HashMap> getTFIDF() throws IOException, CorruptIndexException, ParseException, ClassNotFoundException {
    // Ask the index how many documents it holds; the docNames field is
    // never assigned, so docNames.length would throw a NullPointerException.
    IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true);
    int noOfDocs = re.numDocs();
    re.close();

    return tfIdfScore(noOfDocs);
}

}
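
For completeness, here is a minimal, hypothetical sketch of a main method that could be added to DocIndexer to drive it end to end. The paths are placeholders, and it assumes each file name minus its extension is numeric, since the indexing code parses it as an integer doc id:

public static void main(String[] args) throws Exception {
    // Hypothetical paths -- substitute your own index and collection directories
    DocIndexer indexer = new DocIndexer("/tmp/tfidf-index", "/tmp/doc-collection/");
    indexer.indexDocs(); // build the index first

    HashMap<Integer, HashMap> scores = indexer.getTFIDF();
    for (Map.Entry<Integer, HashMap> entry : scores.entrySet()) {
        System.out.println("doc " + entry.getKey() + " -> " + entry.getValue());
    }
}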

6

If you have a term and want its document frequency, i.e. the number of documents that contain that term: call the IndexReader.terms(Term) method, which gives you a TermEnum object. Then call the TermEnum.docFreq() method, which gives you the document frequency of the term in the index.
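
A minimal sketch of that lookup, assuming an open IndexReader named reader and a hypothetical field/term pair; note that in Lucene 3.x, reader.docFreq(Term) is an equivalent one-call shortcut:

// Position a TermEnum at the term and read its document frequency.
// terms(t) seeks to the first term >= t, so check for an exact match.
Term t = new Term("doccontent", "bond"); // hypothetical field and term
TermEnum te = reader.terms(t);
int df = (te.term() != null && te.term().equals(t)) ? te.docFreq() : 0;
te.close();

// Equivalent shortcut:
int dfDirect = reader.docFreq(t);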
