N個のログファイルをマージして順番を維持する

N個の異なるサービスがN個の異なるサービスから実行されています。私はNファイルを1つのファイルにマージして、時間順を維持したい。ファイルサイズは数KBからGBまでさまざまです。N個のログファイルをマージして順番を維持する

Nログファイルが同じフォーマットを持っている、そしてそれはのようである：

********** LOGGING SESSION STARTED ************ 
* Hmsoa Version: 2.4.0.12 
* Exe Path: c:\program files (x86)\silicon biosystems\deparray300a_driver\deparray300a_driver.exe 
* Exe Version: 1.6.0.154 
************************************************ 


TIME = 2017/02/01 11:12:12,180 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'Connect'->Enter; 
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'Connect'->Exit=0; 
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CCisProxyLocal CONNECT - ok'->Enter; 
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CRecoveryAxesProxyLocal CONNECT - ok'->Enter; 
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CAmplifierProxyLocalV3 CONNECT - ok'->Enter; 
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'SYSTEM_DIAGNOSIS_GET'->Enter; 
TIME = 2017/02/01 11:12:12,211 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'SYSTEM_DIAGNOSIS_GET'->Exit=0; 
TIME = 2017/02/01 11:12:12,211 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'LBL_SQUARE_SET'->Enter; 
TIME = 2017/02/01 11:12:12,219 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'LBL_SQUARE_SET'->Exit=0;

私はすでにN個の異なるファイルを持っているので、私がやったことは、これまでのための単一の行を読んで、外部ソートアルゴリズムを適用することです各ファイル：

#include "stdafx.h" 
#include "boost/regex.hpp" 
#include "boost/lexical_cast.hpp" 
#include "boost\filesystem.hpp" 
#include <string> 
#include <fstream> 
#include <iostream> 
#include <algorithm> 
#include <sstream> 
#include <climits> 
#include <ctime> 
namespace fs = boost::filesystem; 

static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))"); 
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))"); 
static const std::string path("E:\\2017-02-01"); 
//static const std::string path("E:\\TestLog"); 

unsigned long time2Milleseconds(const std::string & time) 
{ 
    int a, b, c, d; 
    if (sscanf_s(time.c_str(), "%d:%d:%d,%d", &a, &b, &c, &d) >= 3) 
     return a * 3600000 + b * 60000 + c * 1000 + d; 
} 

void readAllFilesUntilLine7(std::vector<std::pair<std::ifstream, std::string>> & vifs) 
{ 
    std::string line; 
    for (int i = 0; i < vifs.size(); ++i) 
    { 
     int lineNumber = 0; 
     while (lineNumber != 7 && std::getline(vifs[i].first, line)) 
     { 
      ++lineNumber; 
     } 
    } 
} 

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logTime, std::vector<std::string> & lines, int index, int & counter) 
{ 
    std::string line; 
    boost::smatch what; 
    if (std::getline(vifs[index].first, line)) 
    { 
     if (boost::regex_search(line, what, expression)) 
     { 
      logTime[index] = time2Milleseconds(what[1]); 
     } 
     lines[index] = line; 
    } 
    else 
    { 
     --counter; 
     logTime[index] = ULONG_MAX; 
    } 
} 

void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logTime, std::vector<std::string> & lines, std::ofstream & file, int & counter) 
{ 
    std::string line; 
    boost::smatch what; 
    int index = 0; 
    for (int i = 0; i < vifs.size(); ++i) 
    { 
     checkRegex(vifs, logTime, lines, i, counter); 
    } 
    index = min_element(logTime.begin(), logTime.end()) - logTime.begin(); 
    file << lines[index] << " --> " << vifs[index].second << "\n"; 
    while (true) 
    { 
     checkRegex(vifs, logTime, lines, index, counter); 
     index = min_element(logTime.begin(), logTime.end()) - logTime.begin(); 
     if (0 == counter) 
      break; 
     file << lines[index] << " --> " << vifs[index].second << "\n"; 
    } 
} 

int main() 
{ 
    clock_t begin = clock(); 
    int cnt = std::count_if(fs::directory_iterator(path),fs::directory_iterator(),static_cast<bool(*)(const fs::path&)>(fs::is_regular_file)); 
    std::vector<std::pair<std::ifstream, std::string>> vifs(cnt); 
    int index = 0; 
    boost::smatch what; 
    std::string file; 
    for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d) 
    { 
     if (fs::is_regular_file(d->path())) 
     { 
      file = d->path().filename().string(); 
      if (boost::regex_search(file, what, nameFileEx)) 
      { 
       vifs[index++] = std::make_pair(std::ifstream(d->path().string()), what[1]); 
      } 
     } 
    } 
    std::vector<unsigned long> logTime(cnt, ULONG_MAX); 
    std::vector<std::string> lines(cnt); 
    std::ofstream filename(path + "\\TestLog.txt"); 
    readAllFilesUntilLine7(vifs); 
    mergeFiles(vifs, logTime, lines, filename, cnt); 
    filename.close(); 
    clock_t end = clock(); 
    double elapsed_secs = double(end - begin)/CLOCKS_PER_SEC; 
    std::cout << "Elapsed time = " << elapsed_secs << "\n"; 
    return 0; 
}

これはまさにそれがやることですが、遅いです。 1KB〜250MBの範囲の82個のファイルをマージし、6000000行以上の最終ファイルを作成するには70分かかります。

どのようにアルゴリズムを高速化できますか？どんな助けでも大歓迎です！

私も、ヒープとバージョンを実装している

UPDATE：

Data.h：

#pragma once 

#include <string> 

class Data 
{ 
public: 
    Data(DWORD index, 
     const std::string & line, 
     ULONG time); 
    ~Data(); 
    inline const ULONG getTime() const {return time; } 
    inline const DWORD getIndex() const { return index; } 
    inline const std::string getLine() const { return line; } 
private: 
    DWORD index; 
    std::string line; 
    ULONG time; 
}; 

class Compare 
{ 
public: 
    bool operator()(const Data & lhs, const Data & rhs) { return lhs.getTime() > rhs.getTime(); }; 
};

Data.cpp：

#include "stdafx.h" 
#include "Data.h" 


Data::Data(DWORD i_index, 
      const std::string & i_line, 
      ULONG i_time) 
    : index(i_index) 
    , line(i_line) 
    , time(i_time) 
{ 
} 


Data::~Data() 
{ 
}

メイン。 cpp：

#include "stdafx.h" 
#include "boost/regex.hpp" 
#include "boost/lexical_cast.hpp" 
#include "boost\filesystem.hpp" 
#include <string> 
#include <fstream> 
#include <iostream> 
#include <algorithm> 
#include <sstream> 
#include <climits> 
#include <ctime> 
#include <queue> 
#include "Data.h" 
namespace fs = boost::filesystem; 

static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))"); 
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))"); 
static const std::string path("E:\\2017-02-01"); 
//static const std::string path("E:\\TestLog"); 

unsigned long time2Milleseconds(const std::string & time) 
{ 
    int a, b, c, d; 
    if (sscanf_s(time.c_str(), "%d:%d:%d,%d", &a, &b, &c, &d) >= 3) 
     return a * 3600000 + b * 60000 + c * 1000 + d; 
} 

void initializeHeap(std::ifstream & ifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, const int index) 
{ 
    ULONG time; 
    std::string line; 
    boost::smatch what; 
    bool match = false; 
    while (!match && std::getline(ifs, line)) 
    { 
     if (boost::regex_search(line, what, expression)) 
     { 
      time = time2Milleseconds(what[1]); 
      myHeap.push(Data(index, line, time)); 
      match = true; 
     } 
    } 
} 

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, ULONG time, const int index) 
{ 
    std::string line; 
    boost::smatch what; 
    if (std::getline(vifs[index].first, line)) 
    { 
     if (boost::regex_search(line, what, expression)) 
     { 
      time = time2Milleseconds(what[1]); 
     } 
     myHeap.push(Data(index, line, time)); 
    } 
} 

void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, std::ofstream & file) 
{ 
    int index = 0; 
    ULONG time = 0; 
    while (!myHeap.empty()) 
    { 
     index = myHeap.top().getIndex(); 
     time = myHeap.top().getTime(); 
     file << myHeap.top().getLine() << " --> " << vifs[index].second << "\n"; 
     myHeap.pop(); 
     checkRegex(vifs, myHeap, time, index); 
    } 
} 

int main() 
{ 
    clock_t begin = clock(); 
    int cnt = std::count_if(fs::directory_iterator(path), fs::directory_iterator(), static_cast<bool(*)(const fs::path&)>(fs::is_regular_file)); 
    std::priority_queue<Data, std::vector<Data>, Compare> myHeap; 
    std::vector<std::pair<std::ifstream, std::string>> vifs(cnt); 
    int index = 0; 
    boost::smatch what; 
    std::string file; 
    for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d) 
    { 
     if (fs::is_regular_file(d->path())) 
     { 
      file = d->path().filename().string(); 
      if (boost::regex_search(file, what, nameFileEx)) 
      { 
       vifs[index] = std::make_pair(std::ifstream(d->path().string()), what[1]); 
       initializeHeap(vifs[index].first, myHeap, index); 
       ++index; 
      } 
     } 
    } 
    std::ofstream filename(path + "\\TestLog.txt"); 
    mergeFiles(vifs, myHeap, filename); 
    filename.close(); 
    clock_t end = clock(); 
    double elapsed_secs = double(end - begin)/CLOCKS_PER_SEC; 
    std::cout << "Elapsed time = " << elapsed_secs << "\n"; 
    return 0; 
}

このすべての作業の後、昨日私はDebugでプログラムを実行することに気付きました。

このように約27秒、またはヒープ構造の私の実装は次のとおりです。

ベクトルの実装：約25秒
ヒープ実装リリースで実装の両方を起動、私は以下の結果を得ました最適化されていないか、2つの実装は実行時間が同じです。

私は実行をスピードアップするために他に何かできますか？

出典

2017-04-05 BugsFree

[GNU sort]（https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html）には、 '-m' /' --merge'オプションがあります既にソートされたファイルをマージします。新しいプログラムを書くよりも、それを使うほうが簡単かもしれません。 – ephemient

私は最初のステップとして、ここでIOまたは処理（CPU）がボトルネックかどうかを調べるべきだと思います。（iotop？） –

'file << lines [index] << std :: endl;'これは問題となる可能性があります。 'std :: endl'は（少なくともC++の標準ライブラリでは）内部バッファをフラッシュします。 ''\ n' 'をうまく使います。 –

これは、高速かつ低メモリで行うことができます。最初に検討してください：

各ファイルから1行を読み込みます（したがって、いつでもN行だけがメモリに格納されます）。
N行の中で最小のものを見つけて出力します。
メモリには、現在の出力が出てきたファイルの次の行に出力された値だけを置き換えます（EOFの場合を考慮してください）。

Mは、（結合されたすべてのログの長さすなわち）出力ファイルの長さであれば、些細な実装がO(N * M)だろう。

ただし、ヒープを使用することで上記を改善でき、時間をO(M log N)に減らすことができます。つまり、Nのメモリ内要素をヒープに配置します。一番上の要素をポップアウトして最小の要素を出力します。次に、新しい行を読み込むときに、その行をヒープ上に戻してください。

出典

2017-04-05 22:07:43 TheGreatContini

ヒープを除いて、それはまさにコードが何をしているのですか？ –

@DanielJour速度最適化はヒープです。メモリの最適化は、その時点でメモリに 'N'行しかないのですか？ – TheGreatContini

ヒープが本当にここで改善するのではないかと疑問に思っています。質問の例には82個のファイルがあります。最小のものに対して82行を直線的に検索しても、ヒープを構築して挿入と削除を管理するよりも大幅に遅くなることはありません。問題のコードはすでに 'N'行のみを使用しています：' std :: vector lines（cnt）; '' cnt'はファイル数 'N'です。 –

N個のログファイルをマージして順番を維持する

答えて

関連する問題