シンプルなコードでの作業:異なる振る舞いアルゴリズムの異なるオペレーティングシステム上のUTF8アルゴリズムの
#include <iostream>
#include <string>
std::string::size_type GetLengthWithUTF(std::string &sValue);
int main()
{
std::string sTestValueUTF8 = "\xD0\xB6\xD0\xB6\xD0\xB6";
std::string sTestValueASCII = "\x67\x67\x67";
std::string sTestValueMIX = "\x67\x67\x67\xD0\xB6\xD0\xB6\xD0\xB6";
std::string::size_type iFuncResult = 0;
std::cout << "=========== START TEST ==========\n\n";
std::cout << "+TEST UTF8 STRING\n";
std::cout << "+----+Bytes of string (sTestValueUTF8.length()) = " << sTestValueUTF8.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueUTF8);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueUTF8 << "\")) = " << iFuncResult<< "\n\n";
std::cout << "+TEST ASCII STRING\n";
std::cout << "+----+Bytes of string (sTestValueASCII.length()) = " << sTestValueASCII.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueASCII);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueASCII << "\")) = " << iFuncResult<< "\n\n";
std::cout << "+TEST MIX STRING\n";
std::cout << "+----+Bytes of string (sTestValueMIX.length()) = " << sTestValueMIX.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueMIX);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueMIX << "\")) = " << iFuncResult<< "\n\n";
std::cout << "\n=========== END TEST ==========\n\n";
}
std::string::size_type GetLengthWithUTF(std::string &sValue)
{
std::cout << " +----+START GetLengthWithUTF\n";
std::cout << " +Input string is: " << sValue << "\n";
std::string::size_type i;
std::cout << " +Start cycle\n";
int iCountUTF8characters = 0;
for (i = 0; i < sValue.length(); i++)
{
std::cout << " +----+Iteration N " << i << "\n";
std::cout << " +Current character is: " << sValue[i] << ", integer value = " << (int)sValue[i] << "\n";
if (sValue[i] > 127)
{
iCountUTF8characters++;
std::cout << " +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: " << iCountUTF8characters << "\n";
}
else
{
std::cout << " +----+If statement (sValue[i] > 127) is false.\n";
}
}
std::cout << " +End cycle\n";
iCountUTF8characters = iCountUTF8characters/2;
std::cout << " +Return sValue.length() - (iCountUTF8characters/2) ---> " << sValue.length() << " - (" << iCountUTF8characters << "/2) = " << (sValue.length() - (std::string::size_type)iCountUTF8characters) <<"\n";
std::cout << " +----+ASCIID GetLengthWithUTF\n";
return (sValue.length() - (std::string::size_type)iCountUTF8characters);
}
コンソールコンパイルコマンド:
AIX 6
g++ -o test test.cpp
RHEL Serverの6.7サンティアゴ
g++ -o test test.cpp
Microsoft W indows v10.0.14393
cl /EHsc test.cpp
結果:
AIX 6
=========== START TEST ==========
+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1
+----+Iteration N 1
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2
+----+Iteration N 2
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3
+----+Iteration N 3
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4
+----+Iteration N 4
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5
+----+Iteration N 5
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (3/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 3
+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3
+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1
+----+Iteration N 4
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2
+----+Iteration N 5
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3
+----+Iteration N 6
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4
+----+Iteration N 7
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5
+----+Iteration N 8
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (3/2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 6
=========== END TEST ==========
RHEL Serverの6.7サンティアゴ
=========== START TEST ==========
+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 6
+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3
+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 6
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 7
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 8
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 9
=========== END TEST ==========
のMicrosoft Windows v10.0.14393
=========== START TEST ==========
+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 6
+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3
+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 6
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 7
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 8
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 9
=========== END TEST ==========
アルゴリズムでは文字列内の文字数を計算する必要があります。テストの結果からわかるように、AIXでは正しく動作します。
私は、このオペレーティングシステムのアルゴリズムの振る舞いについて、誰かが私の理解を助けてくれることを嬉しく思います。アルゴリズムはOS AIXで作成されました。 AIXからLINUXへの移行後に問題があることが判明しました。私はより多くのテストを行いました。私の主な問題は、AIXの下でどのように悪いアルゴリズムが動作するかです。私はそれを論理的な方法で説明することはできません。
このアルゴリズムが正しくありません。それはユニコードの小さなサブセットでしか動作しません。より良い方法は、 '0x&0xC0!= 0x80'のようなバイト数を数えて、非初期コード(0x80-0xBFの範囲内のもの)のみを削除することです。 – rici
ええ、あなたは正しいです。このアルゴリズムは古くて古く、200文字以下の文字列をチェックしています。しかし、あなたは上記のようなアルゴリズムを変更しました。私は問題を知るだけで面白かった。 – stoyanov