2011-07-28 7 views

答えて

5

私が気づいているjaro-winkler距離のための機能は組み込まれていません。 @Itzyはすでに私が知っている唯一のものを参照しています。あなたがそれに気づいても、proc fcmpであなた自身の機能をロールすることができます。私はあなたに下のコードを頭から始めます。私はちょうどそれについてのウィキペディアの記事に従おうとしました。 Bill Winklerのstrcmp.cファイルの完璧な表現ではないことは確かであり、多くのバグがありそうです。

proc fcmp outlib=work.jaro.chars; 

    subroutine jaromatch (string1 $ , string2 $ , matchChars $); 
    outargs matchChars; 
    /* Returns number of matched characters between 2 strings excluding blanks*/ 
    /* two chars from string1 and string2 are considered matching 
     if they are no farther than floor(max(|s1|, |s2|)/2)-1 */ 

    str1_len = length(strip(string1)); 
    str2_len = length(strip(string2)); 

    allowedDist = floor(max(str1_len, str2_len)/2) -1; 

    matchChars=""; 

    /* walk through string 1 and match characters to string2 */ 
    do i= 1 to str1_len; 
     x=substr(string1,i,1); 
     position = findc(string2,x ,max(1,i-allowedDist)); 
     if position > 0 then do; 
      if position - i <= allowedDist then do; 
      y=substr(string2,position,1); 
      /* build list of matched characters */ 
      matchChars=cats(matchChars,y); 
     end; 
     end; 
    end; 
    matchChars = strip(matchChars); 
    endsub; 


    function jarotrans (string1 $ , string2 $); 
    ntrans = 0; 
    ubnd = min(length(strip(string1)), length(strip(string2))); 
    do i = 1 to ubnd; 
     if substr(string1,i,1) ne substr(string2,i,1) then do; 
     ntrans + 1; 
     end; 
    end; 
    return(ntrans/2); 
    endsub; 

    function getPrefixlen(string1 $ , string2 $, maxprelen); 
    /* get the length of the matching characters at the beginning */ 
    n = min(maxprelen, length(string1), length(string2)); 
    do i = 1 to n; 
     if substr(string1,i,1) ne substr(string2,i,1) 
     then return(max(1,i-1)); 
    end; 
    endsub; 

    function jarodist(string1 $, string2 $); 
    /* get number of matched characters */ 
    call jaromatch(string1, string2, m1); 
    m1_len = length(m1); 
    if m1_len = 0 then return(0); 
    call jaromatch(string2, string1, m2); 
    m2_len = length(m2); 
    if m2_len = 0 then return(0); 

    /* get number of transposed characters */ 
    ntrans = jarotrans(m1, m2); 
    put m1_len= m2_len= ntrans= ; 
    j_dist = (m1_len/length(string1) 
      + m2_len/length(string2) 
      + (m1_len-ntrans)/m1_len)/3; 
    return(j_dist); 
    endsub; 

    function jarowink(string1 $, string2 $, prefixscale); 
    jarodist=jarodist(string1, string2); 
    prelen=getPrefixlen(string1, string2, 4); 
    if prelen = 0 then return(jarodist); 
    else return(jarodist + prelen * prefixscale * (1-jarodist)); 
    endsub; 

run;quit; 

/* tell SAS where to find the functions we just wrote */ 
option cmplib=work.jaro; 

/* Now let's try it out! */ 
data _null_; 
string1='DIXON'; 
string2='DICKSONX'; 
x=jarodist(string1, string2); 
y=jarowink(string1, string2, 0.1); 
put x= y=; 
run; 
2

私はそうは思わない。 Levenshtein距離(complev関数)または一般編集距離(compged)を実行できますが、他の編集距離関数は見ていません。

SASでこれを実行すると死んだ場合は、PROC IMLにプログラムを書くことができます。

2

cmjohnsのコードを変更して修正しました。私を始めてくれた彼/彼女のおかげです。 Winklerは、彼の論文Winkler、W. E.(2006)にいくつかの例を公開している 。 "レコードのリンクと現在の概要"研究の方向 "を参照してください。研究報告書シリーズ、RRS。 (表6を参照)コードをテストするために例を使用しました。

proc fcmp outlib=work.jaro.chars; 

    /* Returns matched characters between 2 strings. Two chars from string1 and string2 
    are considered matching if they are no farther apart than 
    floor(max(|s1|, |s2|)/2)-1              */ 
    function jaromatch(string1 $, string2 $) $ 40; 
    length matchChars $ 40; 

    str1_len = lengthn(string1); 
    str2_len = lengthn(string2); 

    allowedDist = floor(max(str1_len, str2_len)/2) - 1; 

    *** walk through string1 and match characters to string2 ***; 
    matchChars=""; 
    do i= 1 to str1_len; 
     *** get the part of string2 to search ***; 
     allowed_start = max(1, i - allowedDist);  *** starting char position ***; 
     allowed_str2 = substr(string2, allowed_start, i + allowedDist - allowed_start + 1); 

     *** find i char from string1 in string2 within the allowedDist ***; 
     position = findc(allowed_str2, substr(string1, i, 1)); 
     if position > 0 
     then do; 
    matchChars = cats(matchChars, substr(allowed_str2, position, 1)); 
    *** Once a char is assigned, it can not be assigned again. So, chg char in string2. ***; 
    substr(string2, allowed_start + position -1, 1) = '~'; 
     end; 
    end; 
    return(strip(matchChars)); 
    endsub; 

    /* count the number of "half" transpositions */ 
    function jarotrans(string1 $, string2 $); 
    ntrans = 0; 
    do i = 1 to min(lengthn(strip(string1)), lengthn(strip(string2))); 
     if substr(string1, i, 1) ne substr(string2, i, 1) then ntrans + 1; 
    end; 

    return(ntrans/2); 
    endsub; 

    /* get the length of the matching characters at the beginning */ 
    function getPrefixlen(string1 $, string2 $, maxprelen); 
    n = min(maxprelen, lengthn(string1), lengthn(string2)); 

    if n = 0 
    then return(0); 
    else do; 
     do i = 1 to n; 
    if substr(string1, i, 1) ne substr(string2, i, 1) 
    then return(i - 1); 
     end; 
     return(n); *** all maxprelen characters match ***; 
    end; 
    endsub; 

    /* calc the jaro distance */ 
    function jarodist(string1 $, string2 $); 
    *** get number of matched characters in string1 ***; 
    m1 = jaromatch(string1, string2); 
    m1_len = lengthn(m1); 
    if m1_len = 0 then return(0); 

    *** get number of matched characters in string2 ***; 
    m2 = jaromatch(string2, string1); 
    m2_len = lengthn(m2); 
    if m2_len = 0 then return(0); 

    *** get number of transposed characters ***; 
    ntrans = jarotrans(m1, m2); 

    *** calc jaro distance ***; 
    j_dist = (m1_len/lengthn(string1) + 
      m2_len/lengthn(string2) + 
      (m1_len - ntrans)/m1_len 
     )/3; 

    return(j_dist); 
    endsub; 

    /* calc the jaro-winkler distance */ 
    function jarowink(string1 $, string2 $, prefixscale); 
    string1 = upcase(strip(string1)); 
    string2 = upcase(strip(string2)); 

    *** check for trivial case and calc JW if needed ***; 
    if string1 = string2 
    then return(1.0); 
    else do; 
     jarodist = jarodist(string1, string2); 
     prelen = getPrefixlen(string1, string2, 4); 
     return(jarodist + prelen * prefixscale * (1 - jarodist)); 
    end; 
    endsub; 

run; 

*** tell SAS where to find the functions we just wrote ***; 
option cmplib=work.jaro; 

    /* test code */ 
data _null_; 
    put 'SHACKLEFORD SHACKELFORD 0.982'; 
    jw = jarowink('SHACKLEFORD', 'SHACKELFORD', 0.1); 
    put jw=; 
    put; 
    put 'DUNNINGHAM CUNNIGHAM 0.896'; 
    jw = jarowink('DUNNINGHAM', 'CUNNIGHAM', 0.1); 
    put jw=; 
    put; 
    put 'NICHLESON NICHULSON 0.956'; 
    jw = jarowink('NICHLESON', 'NICHULSON', 0.1); 
    put jw=; 
    put; 
    put 'JONES  JOHNSON  0.832'; 
    jw = jarowink('JONES', 'JOHNSON', 0.1); 
    put jw=; 
    put; 
    put 'MASSEY  MASSIE  0.933'; 
    jw = jarowink('MASSEY', 'MASSIE', 0.1); 
    put jw=; 
    put; 
    put 'ABROMS  ABRAMS  0.922'; 
    jw = jarowink('ABROMS', 'ABRAMS', 0.1); 
    put jw=; 
    put; 
    put 'JERALDINE GERALDINE 0.926'; 
    jw = jarowink('JERALDINE', 'GERALDINE', 0.1); 
    put jw=; 
    put; 
    put 'MARHTA  MARTHA  0.961'; 
    jw = jarowink('MARHTA', 'MARTHA', 0.1); 
    put jw=; 
    put; 
    put 'MICHELLE MICHAEL  0.921'; 
    jw = jarowink('MICHELLE', 'MICHAEL', 0.1); 
    put jw=; 
    put; 
    put 'JULIES  JULIUS  0.933'; 
    jw = jarowink('JULIES', 'JULIUS', 0.1); 
    put jw=; 
    put; 
    put 'TANYA  TONYA  0.880'; 
    jw = jarowink('TANYA', 'TONYA', 0.1); 
    put jw=; 
    put; 
    put 'DWAYNE  DUANE  0.840'; 
    jw = jarowink('DWAYNE', 'DUANE', 0.1); 
    put jw=; 
    put; 
    put 'SEAN  SUSAN  0.805'; 
    jw = jarowink('SEAN', 'SUSAN', 0.1); 
    put jw=; 
    put; 
    put 'JON   JOHN  0.933'; 
    jw = jarowink('JON', 'JOHN', 0.1); 
    put jw=; 
    put; 
run;