2012-02-10 6 views
0

Lucene.Net で、長さが x より大きい用語のみをインデックスするように制限する方法を教えてください（Lucene.Net の Index Term Length を設定する方法）。ドキュメントのインデックス作成と、Lucene のインデックスファイルからインデックス済みの用語を取得するために、次のコードを使用しています:

 String indexDirectory = @"C:\Users\user\Desktop\Index"; 
     String dataDirectory = @"C:\Users\user\Desktop\Data"; 


     StandardAnalyzer analyzer = new StandardAnalyzer(); 
     IndexWriter writer = new IndexWriter(indexDirectory, analyzer); 

     Document doc = new Document(); 

     Field fPath = new Lucene.Net.Documents.Field("path", dataDirectory, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO); 
     Field fContent = new Field("content", ReadTextFile(dataDirectory), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES); 

     doc.Add(fPath); 
     doc.Add(fContent); 

 TermFreqVector[] vectors = IndexReader.Open(indexDirectory).GetTermFreqVectors(0); 

     foreach (Lucene.Net.Index.TermFreqVector vector in vectors) 
     { 
      String[] terms = vector.GetTerms(); 

      foreach (String term in terms) 
      { 
       // loop through indexed terms 
      } 

     } 

回答

2

独自のアナライザを実装したり、StandardAnalyzerを拡張したりすることができます。

例:

TokenFilter +アナライザ

/// <summary>
/// A <see cref="TokenFilter"/> that discards tokens shorter than a given
/// minimum length, so only terms of at least that many characters are indexed.
/// NOTE(review): Lucene also ships a built-in LengthFilter that can do this —
/// confirm availability in the Lucene.Net version in use.
/// </summary>
public class MinTermLengthTokenFilter : TokenFilter
{
    private readonly int minTermLength;
    private readonly TermAttribute termAtt;

    /// <summary>Creates the filter.</summary>
    /// <param name="minTermLength">Smallest token length (in characters) to keep.</param>
    /// <param name="input">Upstream token stream to filter.</param>
    public MinTermLengthTokenFilter(int minTermLength, TokenStream input)
        : base(input)
    {
        // FIX: the parameter was previously misnamed "maxTermLength"
        // even though it is stored and applied as a *minimum* length.
        this.minTermLength = minTermLength;
        termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    }

    /// <summary>
    /// Advances past tokens shorter than the minimum; returns true when a
    /// sufficiently long token is available, false when the stream is exhausted.
    /// </summary>
    public override bool IncrementToken()
    {
        while (input.IncrementToken())
        {
            if (termAtt.TermLength() >= minTermLength)
            {
                return true;
            }
        }
        return false;
    }
}


/// <summary>
/// A <see cref="StandardAnalyzer"/> whose token streams are wrapped in a
/// <see cref="MinTermLengthTokenFilter"/>, dropping terms below a minimum length.
/// </summary>
public class MinTermLengthAnalyzer : StandardAnalyzer
{
    // Minimum number of characters a term must have to survive analysis.
    private readonly int _minTermLength;

    /// <param name="minTermLength">Smallest term length (in characters) to index.</param>
    public MinTermLengthAnalyzer(int minTermLength)
        : base()
    {
        _minTermLength = minTermLength;
    }

    /// <summary>Wraps the standard token stream with the length filter.</summary>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        TokenStream baseStream = base.TokenStream(fieldName, reader);
        return new MinTermLengthTokenFilter(_minTermLength, baseStream);
    }

    /// <summary>Wraps the reusable token stream with the length filter.</summary>
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        TokenStream baseStream = base.ReusableTokenStream(fieldName, reader);
        return new MinTermLengthTokenFilter(_minTermLength, baseStream);
    }
}

インデックス:

// Build a small index with the analyzer configured for a 5-character minimum.
FSDirectory directory = FSDirectory.GetDirectory("C:\\temp\\CFSTEST");
IndexWriter indexWriter = new IndexWriter(directory, new MinTermLengthAnalyzer(5));

// One sample document; term vectors with positions/offsets are stored so the
// indexed terms can be read back later.
Document doc = new Document();
doc.Add(new Field(
    "text",
    "some sample text for demonstration",
    Field.Store.YES,
    Field.Index.ANALYZED,
    Field.TermVector.WITH_POSITIONS_OFFSETS));

indexWriter.AddDocument(doc);
indexWriter.Close();

探索:

 var indexSearcher = new IndexSearcher(IndexReader.Open("C:\\temp\\CFSTEST")); 

     var results = indexSearcher.Search(new TermQuery(new Term("text", "demonstration")), null, 25); 

     foreach (var result in results.ScoreDocs) 
     { 
      TermFreqVector[] vectors = indexSearcher.GetIndexReader().GetTermFreqVectors(result.doc); 

      foreach (Lucene.Net.Index.TermFreqVector vector in vectors) 
      { 
       String[] terms = vector.GetTerms(); 

       foreach (String term in terms) 
       { 
        Console.WriteLine(term); 
       } 

      } 
     } 

     indexSearcher.Close(); 
     // outputs: 
     // demonstration 
     // sample 
関連する問題