
Answers

5

It isn't open source, but you could delegate to Word using the ReadabilityStatistic interface. Even if the document doesn't start out in Word, you can open Word (invisibly to the user), dump the text into it, and then use ReadabilityStatistic to compute the statistics.
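A rough, untested F# sketch of that approach (the function name is made up here, and the interop calls are assumptions that may need adjusting for your version of the Microsoft.Office.Interop.Word assembly):

open Microsoft.Office.Interop.Word

let gradeLevelViaWord (text : string) =
    let word = new ApplicationClass()        // start Word...
    word.Visible <- false                    // ...but keep it hidden from the user
    try
        let doc = word.Documents.Add()       // blank scratch document (Add/Quit take optional COM args; pass them explicitly if your compiler requires it)
        doc.Content.Text <- text             // dump the text into it
        // the ReadabilityStatistics collection can be indexed by statistic name
        doc.ReadabilityStatistics.["Flesch-Kincaid Grade Level"].Value
    finally
        word.Quit()                          // you may want wdDoNotSaveChanges here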

0

I'm surprised there isn't a library for this, but do you really need one?

If you can get at the raw text, the calculation is fairly simple.

Counting syllables is done like this (PHP) using regexps; counting sentences works the same way, except that you split on sentence terminators (.!?) instead of on all of the vowels aeiouy.
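A minimal F# sketch of that idea (deliberately rough; the function names are just for illustration):

open System.Text.RegularExpressions

// count runs of vowels as syllables (at least 1 per word)
let roughSyllables (word : string) =
    Regex.Matches(word.ToLower(), "[aeiouy]+").Count |> max 1

// count sentences by splitting on terminators instead of on vowels
let roughSentences (text : string) =
    Regex.Split(text, @"[.!?]+") |> Array.filter (fun s -> s.Trim().Length > 0) |> Array.length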

+1

Very rough assumptions there, instead of all of those ...

module TextNormalizer

open System
open System.Collections.Generic
open System.Linq
open System.Text.RegularExpressions

let spaceRegex = new Regex(@"\s+")
let normalizeTextRegexStrict = new Regex(String.Join("|", [| @"[^\w\s]"; @"[0-9]+"; "_" |]), RegexOptions.Compiled)
let normalizeTextRegexApostrophe = new Regex(String.Join("|", [| @"[^'\w\s]"; @"[0-9]+"; "_" |]), RegexOptions.Compiled)

/// <summary>
/// Replaces all punctuation with whitespace, apostrophe optional. Returns a string matching the original text with punctuation
/// removed, text lowercased, and words evenly delimited with whitespace
/// </summary>
/// <param name="normedLine"></param>
/// <param name="removeApostrophe"></param>
let Normalize (normedLine : string) (removeApostrophe) =
    let normedLine =
        if removeApostrophe then
            normalizeTextRegexStrict.Replace(normedLine, " ")     // replace all punctuation with whitespace
        else
            normalizeTextRegexApostrophe.Replace(normedLine, " ") // replace all except apostrophe with whitespace
    // return
    spaceRegex.Replace(normedLine, " ") // reduce contiguous whitespace to a single space
              .Trim()                   // get rid of any whitespace on the ends
              .ToLower()                // lowercase the whole thing

For more detail on why computing FK isn't so simple (or just look at the multi-vowel words in this sentence :-), see http://stackoverflow.com/a/1076924/1226839 – Nathan

1

As described by the Flesch-Kincaid grade level formula:

https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

you need to count words, sentences, and syllables. Sentences take some thought as well, but syllables are probably the hardest.
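For reference, the grade-level formula itself (the same one computed at the end of the code below) is:

FKGL = 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59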

Here are two translations of syllable-counting code into F# (which is .NET; you can create an F# project in Visual Studio and reference it from your C# project). I've done basic but not extensive testing on these.

I found that the Ipeirotis version (with the problem-word list added) did better on a few test cases than the Child version. My test words are below:

let testWords = [|"abalone";"gracious";"atheism";"unaware"; "seaside";"underwater";"wonderwoman";"biology"|] 

The Child code has problems, particularly with the words at the end of the list. Re-ordering the regexes from longest affix to shortest doesn't seem to fix it.

My translations:

module Readability 

open System.Text.RegularExpressions 
//for syllables 
//simpler: 
//https://github.com/ipeirotis/ReadabilityMetrics/blob/master/src/main/java/com/ipeirotis/readability/engine/Syllabify.java 
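// NOTE: problemWordMap (defined further down in this listing) must appear above this
// function in the actual .fs file, since F# requires values to be defined before use.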

let SyllableCount2 (word:string) = 
    let SubSyl = [| "cial"; "tia"; "cius"; "cious"; "giu"; "ion"; "iou"; "sia$"; ".ely$" |] 
    let AddSyl = [| "ia"; "riet"; "dien"; "iu"; "io"; "ii"; "[aeiouym]bl$"; "[aeiou]{3}"; "^mc"; "ism$"; "[^aeiouy][^aeiouy]l$"; "[^l]lien"; "^coa[dglx]."; "[^gq]ua[^auieo]"; "dnt$" |] 

    let mutable tempWord = word.ToLower() 
    tempWord <- tempWord.Replace("'", " ") 

    if problemWordMap.ContainsKey(word) then 
     problemWordMap.[word] 
    else if tempWord = "i" || tempWord = "a" then 
     1 
    else 
     if tempWord.EndsWith("e") then 
      tempWord <- tempWord.Substring(0, tempWord.Length - 1) 

     let phonems = Regex.Split(tempWord, "[^aeiouy]+") 

     let mutable syl = 0; 

     for i = 0 to SubSyl.Length - 1 do 
      let syllabe = SubSyl.[i]; 
      if Regex.IsMatch(tempWord, syllabe) then 
       syl <- syl - 1 

     for i = 0 to AddSyl.Length - 1 do 
      let syllabe = AddSyl.[i]; 
      if Regex.IsMatch(tempWord, syllabe) then 
       syl <- syl + 1 

     if tempWord.Length = 1 then 
      syl <- syl + 1 

     for i = 0 to phonems.Length - 1 do 
      if phonems.[i].Length > 0 then 
       syl <- syl + 1 

     if syl = 0 then 
      syl <- 1 

     // return 
     syl 

//https://github.com/DaveChild/Text-Statistics/blob/master/src/DaveChild/TextStatistics/Syllables.php 

let problemWordMap = 
    dict[ 
     ("abalone", 4); 
     ("abare", 3); 
     ("abed" , 2); 
     ("abruzzese", 4); 
     ("abbruzzese" , 4); 
     ("aborigine", 5); 
     ("aborigines", 5); //andrew plural (ap) 
     ("acreage", 3); 
     ("acreage", 3); //ap 
     ("adame", 3); 
     ("adieu", 2); 
     ("adobe", 3); 
     ("anemone", 4); 
     ("anemones", 4); //ap 
     ("apache" , 3); 
     ("apaches" , 3); //ap 
     ("aphrodite", 4); 
     ("apostrophe" , 4); 
     ("apostrophes" , 4); //ap 
     ("ariadne", 4); 
     ("cafe" , 2); 
     ("cafes" , 2); //ap 
     ("calliope" , 4); 
     ("catastrophe", 4); 
     ("catastrophes", 4); //ap 
     ("chile", 2); 
     ("chiles", 2); //ap 
     ("chloe", 2); 
     ("circe", 2); 
     ("coyote" , 3); 
     ("coyotes" , 3); //ap 
     ("epitome", 4); 
     ("forever", 3); 
     ("gethsemane" , 4); 
     ("guacamole", 4); 
     ("guacamoles", 4); //ap 
     ("hyperbole", 4); 
     ("hyperboles", 4); //ap 
     ("jesse", 2); 
     ("jukebox", 2); 
     ("jukeboxes", 2); //ap 
     ("karate" , 3); 
     ("karates" , 3); //ap 
     ("machete", 3); 
     ("maybe", 2); 
     ("people" , 2); 
     ("recipe" , 3); 
     ("sesame" , 3); 
     ("shoreline", 2); 
     ("simile" , 3); 
     ("machetes", 3); //ap 
     ("maybes", 2);//ap 
     ("peoples" , 2);//ap 
     ("recipes" , 3);//ap 
     ("sesames" , 3);//ap 
     ("shorelines", 2);//ap 
     ("similes" , 3);//ap 
     ("syncope", 3); 
     ("tamale" , 3); 
     ("tamales" , 3); //ap 
     ("yosemite" , 4); 
     ("daphne" , 2); 
     ("eurydice" , 4); 
     ("euterpe", 3); 
     ("hermione" , 4); 
     ("penelope" , 4); 
     ("persephone" , 4); 
     ("phoebe" , 2); 
     ("zoe", 2); 
    ] 

// These syllables would be counted as two but should be one 
let oneSyllableCorrection = 
    [| 
     "cia(l|$)"; // glacial, acacia 
     "tia"; 
     "cius"; 
     "cious"; 
     "[^aeiou]giu"; 
     "[aeiouy][^aeiouy]ion"; 
     "iou"; 
     "sia$"; 
     "eous$"; 
     "[oa]gue$"; 
     ".[^aeiuoycgltdb]{2,}ed$"; 
     ".ely$"; 
     //"[cg]h?ed?$"; 
     //"rved?$"; 
     //"[aeiouy][dt]es?$"; 
     //"^[dr]e[aeiou][^aeiou]+$"; // Sorts out deal, deign etc 
     //"[aeiouy]rse$"; // Purse, hearse 
     "^jua"; 
     //"nne[ds]?$"; // canadienne 
     "uai"; // acquainted 
     "eau"; // champeau 
     //"pagne[ds]?$"; // champagne 
     //"[aeiouy][^aeiuoytdbcgrnzs]h?e[rsd]?$"; 
     // The following detects words ending with a soft e ending. Don't 
     // mess with it unless you absolutely have to! The following 
     // is a list of words you can use to test a new version of 
     // this rule (add "r", "s" and "d" where possible to test 
     // fully): 
     // - absolve 
     // - acquiesce 
     // - audience 
     // - ache 
     // - acquire 
     // - brunelle 
     // - byrne 
     // - canadienne 
     // - coughed 
     // - curved 
     // - champagne 
     // - designate 
     // - force 
     // - lace 
     // - late 
     // - lathe 
     // - make 
     // - relayed 
     // - scrounge 
     // - side 
     // - sideline 
     // - some 
     // - wide 
     // - taste 
     "[aeiouy](b|c|ch|d|dg|f|g|gh|gn|k|l|ll|lv|m|mm|n|nc|ng|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y|z)e$"; 
     // For soft e endings with a "d". Test words: 
     // - crunched 
     // - forced 
     // - hated 
     // - sided 
     // - sidelined 
     // - unexploded 
     // - unexplored 
     // - scrounged 
     // - squelched 
     // - forced 
     "[aeiouy](b|c|ch|dg|f|g|gh|gn|k|l|lch|ll|lv|m|mm|n|nc|ng|nch|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|th|v|y|z)ed$"; 
     // For soft e endings with a "s". Test words: 
     // - absences 
     // - accomplices 
     // - acknowledges 
     // - advantages 
     // - byrnes 
     // - crunches 
     // - forces 
     // - scrounges 
     // - squelches 
     "[aeiouy](b|ch|d|f|gh|gn|k|l|lch|ll|lv|m|mm|n|nch|nn|p|r|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y)es$"; 
     "^busi$"; 
    |] |> String.concat("|") |> Regex 


// These syllables would be counted as one but should be two 
let twoSyllableCorrection = 
    [| 
     "([^s]|^)ia"; 
     "riet"; 
     "dien"; // audience 
     "iu"; 
     "io"; 
     "eo($|[b-df-hj-np-tv-z])"; 
     "ii"; 
     "[ou]a$"; 
     "[aeiouym]bl$"; 
     "[aeiou]{3}"; 
     "[aeiou]y[aeiou]"; 
     "^mc"; 
     "ism$"; 
     "asm$"; 
     "thm$"; 
     "([^aeiouy])\1l$"; 
     "[^l]lien"; 
     "^coa[dglx]."; 
     "[^gq]ua[^auieo]"; 
     "dnt$"; 
     "uity$"; 
     "[^aeiouy]ie(r|st|t)$"; 
     "eings?$"; 
     "[aeiouy]sh?e[rsd]$"; 
     "iell"; 
     "dea$"; 
     "real"; // real, cereal 
     "[^aeiou]y[ae]"; // bryan, byerley 
     "gean$"; // aegean 
     "uen"; // influence, affluence 

    |] |> String.concat("|") |> Regex 

// Single syllable prefixes and suffixes 
let oneSyllableAffix = 
    [| 
     "^un"; 
     "^fore"; 
     "^ware"; 
     "^none?"; 
     "^out"; 
     "^post"; 
     "^sub"; 
     "^pre"; 
     "^pro"; 
     "^dis"; 
     "^side"; 
     "ly$"; 
     "less$"; 
     "some$"; 
     "ful$"; 
     "ers?$"; 
     "ness$"; 
     "cians?$"; 
     "ments?$"; 
     "ettes?$"; 
     "villes?$"; 
     "ships?$"; 
     "sides?$"; 
     "ports?$"; 
     "shires?$"; 
     "tion(ed)?$"; 

    |] |> String.concat("|") |> Regex 

// Double syllable prefixes and suffixes 
let twoSyllableAffix = 
    [| 
     "^above"; 
     "^ant[ie]"; 
     "^counter"; 
     "^hyper"; 
     "^afore"; 
     "^agri"; 
     "^in[ft]ra"; 
     "^inter"; 
     "^over"; 
     "^semi"; 
     "^ultra"; 
     "^under"; 
     "^extra"; 
     "^dia"; 
     "^micro"; 
     "^mega"; 
     "^kilo"; 
     "^pico"; 
     "^nano"; 
     "^macro"; 
     "berry$"; 
     "woman$"; 
     "women$"; 

    |] |> String.concat("|") |> Regex 

// Triple syllable prefixes and suffixes 
let threeSyllableAffix = 
    [| 
     "ology$"; 
     "ologist$"; 
     "onomy$"; 
     "onomist$"; 
    |] |> String.concat("|") |> Regex 

/// <summary> 
/// For each match in pattern, replace match with empty string in input word, 
/// returning bare word and # matches 
/// </summary> 
/// <param name="pattern"></param> 
/// <param name="word"></param> 
let RegexReplace (regex:Regex) word = 
    //let affixReplace = new Regex(pattern) 
    let matches = regex.Matches(word) 
    let mutable bareWord = word 
    for aMatch in matches do 
     bareWord <- bareWord.Replace(aMatch.Value,"") 
    // 
    bareWord, matches.Count //need to exclude a group? 

let CountMatches (regex:Regex) word = 
    //let regex = new Regex(pattern) 
    let matches = regex.Matches(word) 
    // 
    matches.Count 

/// <summary> 
/// Counts syllables in word. Assumes word has already been "cleaned" 
/// </summary> 
/// <param name="word"></param> 
let SyllableCount(word : string) = 
    if problemWordMap.ContainsKey(word) then 
     problemWordMap.[word] 
    else 
     //remove and count affixes 
     let wordMinus1Affix, oneAffixCount = RegexReplace oneSyllableAffix word 
     let wordMinus2Affix, twoAffixCount = RegexReplace twoSyllableAffix wordMinus1Affix 
     let wordMinus3Affix, threeAffixCount = RegexReplace threeSyllableAffix wordMinus2Affix 

     //count word parts 
     let vowelSplit = Regex.Split(wordMinus3Affix, "[^aeiouy]") 
     let mutable wordPartCount = 0 
     for wordPart in vowelSplit do 
      if wordPart.Length > 0 then 
       wordPartCount <- wordPartCount + 1 

     //base syllable count 
     let mutable baseSyllableCount = oneAffixCount + twoAffixCount + threeAffixCount + wordPartCount 

     //handle degenerate cases 
     let oneSyllableCorrectionCount = CountMatches oneSyllableCorrection word //count two as one: subtract 
     let twoSyllableCorrectionCount = CountMatches twoSyllableCorrection word //count one as two: add 

     baseSyllableCount <- baseSyllableCount - oneSyllableCorrectionCount + twoSyllableCorrectionCount 

     //we always have 1 syllable in a word 
     if baseSyllableCount > 0 then 
      baseSyllableCount 
     else 
      1 
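As a rough, illustrative check (not a unit test), the two counters can be eyeballed against the testWords array from earlier, assuming everything is in scope in one module:

// print both counters' results for each test word (illustrative only)
testWords
|> Array.iter (fun w -> printfn "%-12s Ipeirotis=%d Child=%d" w (SyllableCount2 w) (SyllableCount w))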

To handle sentence counting, I wrote this wrapper around the Stanford Parser NuGet package:

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Threading.Tasks; 
using edu.stanford.nlp.process; 
using edu.stanford.nlp.util; 

namespace StanfordWrapper 
{ 
    public class SentenceTokenizer 
    { 
     public static readonly TokenizerFactory TokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), 
       "normalizeParentheses=false,normalizeOtherBrackets=false,invertible=true"); 

     public static List<string> Go(string input) 
     { 
      java.io.Reader reader = new java.io.StringReader(input); 
      DocumentPreprocessor dp = new DocumentPreprocessor(reader); 
      dp.setTokenizerFactory(TokenizerFactory); 

      List<string> output = new List<string>(); 
      foreach (java.util.List sentence in dp) 
      { 
       output.Add(StringUtils.joinWithOriginalWhiteSpace(sentence)); 
      } 

      return output; 
     } 
    } 
} 

The wrapper is handy because the parser is in Java; the NuGet package uses IKVMC to make it callable from .NET.

Finally, for word counting I use some code to clean up / tokenize the text (the TextNormalizer module shown earlier), and then:

let FleshKincaidGradeLevel(text) = 
    let sentences = StanfordWrapper.SentenceTokenizer.Go(text) |> Seq.toArray 

    let words = sentences |> Array.map(fun x -> TextNormalizer.Normalize x false) |> Array.collect(fun x -> x.Split(' ')) 

    let syllableCount = words |> Array.map SyllableCount2 |> Array.sum 

    //FKGL formula: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests 
    (0.39 * (float words.Length)/(float sentences.Length)) + (11.8 * (float syllableCount)/(float words.Length)) - 15.59
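
For example (illustrative only; the sample text and its score are not from the original post):

let sample = "The quick brown fox jumps over the lazy dog. It was not amused."
printfn "FKGL = %.2f" (FleshKincaidGradeLevel sample)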