2016-11-18 7 views
3

シンプルなコードでの作業:異なる振る舞いアルゴリズムの異なるオペレーティングシステム上のUTF8アルゴリズムの

#include <iostream> 
#include <string> 

std::string::size_type GetLengthWithUTF(std::string &sValue); 

int main() 
{ 
    std::string sTestValueUTF8 = "\xD0\xB6\xD0\xB6\xD0\xB6"; 
    std::string sTestValueASCII = "\x67\x67\x67"; 
    std::string sTestValueMIX = "\x67\x67\x67\xD0\xB6\xD0\xB6\xD0\xB6"; 
    std::string::size_type iFuncResult = 0; 

    std::cout << "=========== START TEST ==========\n\n"; 

    std::cout << "+TEST UTF8 STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueUTF8.length()) = " << sTestValueUTF8.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueUTF8); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueUTF8 << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "+TEST ASCII STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueASCII.length()) = " << sTestValueASCII.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueASCII); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueASCII << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "+TEST MIX STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueMIX.length()) = " << sTestValueMIX.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueMIX); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueMIX << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "\n=========== END TEST ==========\n\n"; 
} 

std::string::size_type GetLengthWithUTF(std::string &sValue) 
{ 
    std::cout << "  +----+START GetLengthWithUTF\n"; 
    std::cout << "   +Input string is: " << sValue << "\n"; 
    std::string::size_type i; 
    std::cout << "   +Start cycle\n"; 
    int iCountUTF8characters = 0; 
    for (i = 0; i < sValue.length(); i++) 
    { 
     std::cout << "   +----+Iteration N " << i << "\n"; 
     std::cout << "    +Current character is: " << sValue[i] << ", integer value = " << (int)sValue[i] << "\n"; 
     if (sValue[i] > 127) 
     { 
      iCountUTF8characters++; 
      std::cout << "    +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: " << iCountUTF8characters << "\n"; 
     } 
     else 
     { 
      std::cout << "    +----+If statement (sValue[i] > 127) is false.\n"; 
     } 
    } 

    std::cout << "   +End cycle\n"; 
    iCountUTF8characters = iCountUTF8characters/2; 
    std::cout << "   +Return sValue.length() - (iCountUTF8characters/2) ---> " << sValue.length() << " - (" << iCountUTF8characters << "/2) = " << (sValue.length() - (std::string::size_type)iCountUTF8characters) <<"\n"; 
    std::cout << "  +----+ASCIID GetLengthWithUTF\n"; 
    return (sValue.length() - (std::string::size_type)iCountUTF8characters); 
} 

コンソールコンパイルコマンド:

AIX 6

g++ -o test test.cpp 

RHEL Serverの6.7サンティアゴ

g++ -o test test.cpp 

Microsoft W indows v10.0.14393

cl /EHsc test.cpp 



結果:

AIX 6

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2 
      +----+Iteration N 2 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4 
      +----+Iteration N 4 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (3/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 3 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2 
      +----+Iteration N 5 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4 
      +----+Iteration N 7 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (3/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 6 


=========== END TEST ========== 

RHEL Serverの6.7サンティアゴ

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 6 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 7 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 9 


=========== END TEST ========== 

のMicrosoft Windows v10.0.14393

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 6 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 7 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 9 


=========== END TEST ========== 

アルゴリズムでは文字列内の文字数を計算する必要があります。テストの結果からわかるように、AIXでは正しく動作します。

私は、このオペレーティングシステムのアルゴリズムの振る舞いについて、誰かが私の理解を助けてくれることを嬉しく思います。アルゴリズムはOS AIXで作成されました。 AIXからLINUXへの移行後に問題があることが判明しました。私はより多くのテストを行いました。私の主な問題は、AIXの下でどのように悪いアルゴリズムが動作するかです。私はそれを論理的な方法で説明することはできません。

+2

このアルゴリズムが正しくありません。それはユニコードの小さなサブセットでしか動作しません。より良い方法は、 '0x&0xC0!= 0x80'のようなバイト数を数えて、非初期コード(0x80-0xBFの範囲内のもの)のみを削除することです。 – rici

+0

ええ、あなたは正しいです。このアルゴリズムは古くて古く、200文字以下の文字列をチェックしています。しかし、あなたは上記のようなアルゴリズムを変更しました。私は問題を知るだけで面白かった。 – stoyanov

答えて

4

標準で許可されている文字の記号を扱う方法が2種類あると思われます。ご使用のAIXコンパイラーはcharを符号なしとして扱いますが、他の2つのシステムはそれらを符号付きとして扱います。

符号なし文字を使用するシステムでは、条件sValue[i] > 127は、予想どおりの動作をします。しかし、同じ表現は、符号付き文字を持つシステムでは決して成功しません。

これは、コード128以上の文字に対して負の数を取得する理由です。たとえば、208は、シングルバイトの符号付き値として扱われる場合、-48になります。

あなたは、符号なしに変換を強制することにより、ビットマスクで8のビットをチェックすることにより、この問題を解決することができ

if (sValue[i] & 128) { 
    ... // MSB is set 
} 
+0

くそー!そのとおり!どうもありがとうございました!!! (unsigned(sValue [i])> 127)が遅いバージョンだが、一部の開発者にとってはより読みやすい場合) – stoyanov

関連する問題