5

私は、Microsoft Computer Visionを使用して領収書を読み取ります。領収書の構造に一致するOCR/Computer Visionの結果を処理します。

結果は、列ごとにグループ化された領域として並べられます(例:数量、製品名、金額は3つの異なる領域に分かれています)。

製品のリスト全体が1つの領域であり、各行が製品であることが望ましいと思います。

これを達成するようにComputer Visionを設定する方法はありますか?あるいは、すべての単語の位置が利用可能なので、結果の後処理に使える優れたテクニックやライブラリはありますか?

以下は、レシートの画像とComputer Visionの結果です。

receipt

{ 
    "language": "sv", 
    "textAngle": 2.0999999999999632, 
    "orientation": "Up", 
    "regions": [ 
    { 
     "boundingBox": "1012,450,660,326", 
     "lines": [ 
     { 
      "boundingBox": "1362,450,76,30", 
      "words": [ 
      { 
       "boundingBox": "1362,450,76,30", 
       "text": "JULA" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1207,486,465,49", 
      "words": [ 
      { 
       "boundingBox": "1207,502,172,33", 
       "text": "Ekslinsan" 
      }, 
      { 
       "boundingBox": "1400,497,51,30", 
       "text": "3B," 
      }, 
      { 
       "boundingBox": "1479,491,95,33", 
       "text": "25467" 
      }, 
      { 
       "boundingBox": "1595,486,77,32", 
       "text": "VALA" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1304,539,265,38", 
      "words": [ 
      { 
       "boundingBox": "1304,539,265,38", 
       "text": "SE5S6944785601" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1245,584,369,44", 
      "words": [ 
      { 
       "boundingBox": "1245,594,148,34", 
       "text": "Telefon:" 
      }, 
      { 
       "boundingBox": "1421,584,193,37", 
       "text": "042-324040" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1012,695,269,35", 
      "words": [ 
      { 
       "boundingBox": "1012,702,75,28", 
       "text": "Kund" 
      }, 
      { 
       "boundingBox": "1109,695,172,33", 
       "text": "072202787" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1109,738,289,38", 
      "words": [ 
      { 
       "boundingBox": "1109,744,133,32", 
       "text": "LILLVIK" 
      }, 
      { 
       "boundingBox": "1265,738,133,32", 
       "text": "ANDREAS" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1085,845,14,516", 
     "lines": [ 
     { 
      "boundingBox": "1090,845,9,29", 
      "words": [ 
      { 
       "boundingBox": "1090,845,9,29", 
       "text": "1" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1087,1037,9,28", 
      "words": [ 
      { 
       "boundingBox": "1087,1037,9,28", 
       "text": "1" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1086,1133,9,27", 
      "words": [ 
      { 
       "boundingBox": "1086,1133,9,27", 
       "text": "I" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1085,1332,9,29", 
      "words": [ 
      { 
       "boundingBox": "1085,1332,9,29", 
       "text": "1" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1122,839,454,573", 
     "lines": [ 
     { 
      "boundingBox": "1128,839,173,33", 
      "words": [ 
      { 
       "boundingBox": "1128,843,36,29", 
       "text": "ST" 
      }, 
      { 
       "boundingBox": "1186,839,115,30", 
       "text": "661107" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1127,879,389,41", 
      "words": [ 
      { 
       "boundingBox": "1127,887,232,33", 
       "text": "VERKTYGSLÅDR" 
      }, 
      { 
       "boundingBox": "1382,883,36,28", 
       "text": "JC" 
      }, 
      { 
       "boundingBox": "1441,882,16,26", 
       "text": "5" 
      }, 
      { 
       "boundingBox": "1481,879,35,28", 
       "text": "ÅR" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1126,935,173,34", 
      "words": [ 
      { 
       "boundingBox": "1126,940,36,29", 
       "text": "ST" 
      }, 
      { 
       "boundingBox": "1187,935,112,32", 
       "text": "181460" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1126,967,450,50", 
      "words": [ 
      { 
       "boundingBox": "1126,987,75,30", 
       "text": "BORR" 
      }, 
      { 
       "boundingBox": "1224,977,193,35", 
       "text": "GLAS/KRKEL" 
      }, 
      { 
       "boundingBox": "1440,974,16,27", 
       "text": "ø" 
      }, 
      { 
       "boundingBox": "1482,971,34,27", 
       "text": "10" 
      }, 
      { 
       "boundingBox": "1539,967,37,28", 
       "text": "MM" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1125,1027,173,37", 
      "words": [ 
      { 
       "boundingBox": "1125,1036,36,28", 
       "text": "ST" 
      }, 
      { 
       "boundingBox": "1185,1027,113,34", 
       "text": "181740" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1124,1062,432,49", 
      "words": [ 
      { 
       "boundingBox": "1124,1071,252,40", 
       "text": "UNIVERSALBORR" 
      }, 
      { 
       "boundingBox": "1400,1066,96,32", 
       "text": "8X120" 
      }, 
      { 
       "boundingBox": "1519,1062,37,30", 
       "text": "MM" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1123,1125,175,34", 
      "words": [ 
      { 
       "boundingBox": "1123,1129,36,30", 
       "text": "ST" 
      }, 
      { 
       "boundingBox": "1183,1125,115,32", 
       "text": "181738" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1122,1164,416,44", 
      "words": [ 
      { 
       "boundingBox": "1122,1170,255,38", 
       "text": "UNIVERSRLBORR" 
      }, 
      { 
       "boundingBox": "1501,1164,37,31", 
       "text": "MM" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1123,1225,170,33", 
      "words": [ 
      { 
       "boundingBox": "1123,1228,36,30", 
       "text": "ST" 
      }, 
      { 
       "boundingBox": "1183,1225,110,32", 
       "text": "316401" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1123,1270,355,39", 
      "words": [ 
      { 
       "boundingBox": "1123,1275,216,34", 
       "text": "LÅSCYLINDER" 
      }, 
      { 
       "boundingBox": "1362,1270,116,33", 
       "text": "2-PACK" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1123,1327,177,34", 
      "words": [ 
      { 
       "boundingBox": "1123,1330,37,31", 
       "text": "ST" 
      }, 
      { 
       "boundingBox": "1183,1327,117,32", 
       "text": "396026" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1124,1373,356,39", 
      "words": [ 
      { 
       "boundingBox": "1124,1377,216,35", 
       "text": "LÅSCYLINDER" 
      }, 
      { 
       "boundingBox": "1363,1373,117,33", 
       "text": "2-PRCK" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1644,820,118,524", 
     "lines": [ 
     { 
      "boundingBox": "1658,820,96,31", 
      "words": [ 
      { 
       "boundingBox": "1658,820,96,31", 
       "text": "79,00" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1659,912,97,31", 
      "words": [ 
      { 
       "boundingBox": "1659,916,50,27", 
       "text": "44," 
      }, 
      { 
       "boundingBox": "1719,912,37,28", 
       "text": "90" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1659,1004,98,32", 
      "words": [ 
      { 
       "boundingBox": "1659,1007,51,29", 
       "text": "69," 
      }, 
      { 
       "boundingBox": "1720,1004,37,28", 
       "text": "90" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1661,1103,97,35", 
      "words": [ 
      { 
       "boundingBox": "1661,1103,97,35", 
       "text": "49,90" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1644,1309,118,35", 
      "words": [ 
      { 
       "boundingBox": "1644,1309,118,35", 
       "text": "299,00" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1064,1469,620,45", 
     "lines": [ 
     { 
      "boundingBox": "1064,1469,620,45", 
      "words": [ 
      { 
       "boundingBox": "1064,1481,237,33", 
       "text": "-Rabattcheck" 
      }, 
      { 
       "boundingBox": "1324,1486,51,24", 
       "text": "nr:" 
      }, 
      { 
       "boundingBox": "1384,1469,300,38", 
       "text": "935011035567095" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1123,1584,159,82", 
     "lines": [ 
     { 
      "boundingBox": "1123,1584,159,33", 
      "words": [ 
      { 
       "boundingBox": "1123,1584,159,33", 
       "text": "DELSUMMA" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1143,1635,116,31", 
      "words": [ 
      { 
       "boundingBox": "1143,1635,116,31", 
       "text": "Rabatt" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1609,1570,180,189", 
     "lines": [ 
     { 
      "boundingBox": "1609,1570,160,36", 
      "words": [ 
      { 
       "boundingBox": "1609,1575,11,31", 
       "text": "|" 
      }, 
      { 
       "boundingBox": "1648,1570,121,34", 
       "text": "041,70" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1690,1621,99,34", 
      "words": [ 
      { 
       "boundingBox": "1690,1621,99,34", 
       "text": "50,00" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1651,1725,120,34", 
      "words": [ 
      { 
       "boundingBox": "1651,1727,53,32", 
       "text": "991" 
      }, 
      { 
       "boundingBox": "1715,1746,9,13", 
       "text": "," 
      }, 
      { 
       "boundingBox": "1732,1725,39,32", 
       "text": "70" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "992,1737,310,1226", 
     "lines": [ 
     { 
      "boundingBox": "1123,1737,179,35", 
      "words": [ 
      { 
       "boundingBox": "1123,1737,179,35", 
       "text": "SLUTSUMMA" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1036,2756,227,35", 
      "words": [ 
      { 
       "boundingBox": "1036,2756,227,35", 
       "text": "Totalbelopp" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1140,2811,124,37", 
      "words": [ 
      { 
       "boundingBox": "1140,2811,53,35", 
       "text": "991" 
      }, 
      { 
       "boundingBox": "1207,2833,8,15", 
       "text": "/" 
      }, 
      { 
       "boundingBox": "1225,2811,39,34", 
       "text": "70" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "992,2927,271,36", 
      "words": [ 
      { 
       "boundingBox": "992,2928,159,35", 
       "text": "Säljare:" 
      }, 
      { 
       "boundingBox": "1182,2927,81,33", 
       "text": "7688" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1330,2754,145,92", 
     "lines": [ 
     { 
      "boundingBox": "1330,2754,144,34", 
      "words": [ 
      { 
       "boundingBox": "1330,2754,39,33", 
       "text": "Ex" 
      }, 
      { 
       "boundingBox": "1394,2754,80,34", 
       "text": "Moms" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1352,2809,123,37", 
      "words": [ 
      { 
       "boundingBox": "1352,2809,123,37", 
       "text": "793,36" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1563,2752,126,92", 
     "lines": [ 
     { 
      "boundingBox": "1563,2752,125,33", 
      "words": [ 
      { 
       "boundingBox": "1563,2752,82,33", 
       "text": "Moms" 
      }, 
      { 
       "boundingBox": "1670,2755,18,27", 
       "text": "%" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1586,2808,103,36", 
      "words": [ 
      { 
       "boundingBox": "1586,2808,103,36", 
       "text": "25,00" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "1780,2751,123,93", 
     "lines": [ 
     { 
      "boundingBox": "1820,2751,83,33", 
      "words": [ 
      { 
       "boundingBox": "1820,2751,83,33", 
       "text": "Moms" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1780,2807,123,37", 
      "words": [ 
      { 
       "boundingBox": "1780,2807,123,37", 
       "text": "198,34" 
      } 
      ] 
     } 
     ] 
    }, 
    { 
     "boundingBox": "985,2924,966,573", 
     "lines": [ 
     { 
      "boundingBox": "1523,2924,83,33", 
      "words": [ 
      { 
       "boundingBox": "1523,2924,83,33", 
       "text": "7618" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1288,2926,167,33", 
      "words": [ 
      { 
       "boundingBox": "1288,2939,17,7", 
       "text": "-" 
      }, 
      { 
       "boundingBox": "1330,2926,125,33", 
       "text": "Sabina" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1182,2981,468,36", 
      "words": [ 
      { 
       "boundingBox": "1182,2983,38,34", 
       "text": "24" 
      }, 
      { 
       "boundingBox": "1245,2982,146,34", 
       "text": "oktober" 
      }, 
      { 
       "boundingBox": "1416,2982,82,34", 
       "text": "2016" 
      }, 
      { 
       "boundingBox": "1547,2982,10,33", 
       "text": "1" 
      }, 
      { 
       "boundingBox": "1571,2981,79,34", 
       "text": "7:20" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "991,2985,103,33", 
      "words": [ 
      { 
       "boundingBox": "991,2985,103,33", 
       "text": "Datum" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1161,3040,403,34", 
      "words": [ 
      { 
       "boundingBox": "1161,3040,96,34", 
       "text": "44601" 
      }, 
      { 
       "boundingBox": "1288,3040,140,34", 
       "text": "Kvitto:" 
      }, 
      { 
       "boundingBox": "1460,3040,104,34", 
       "text": "51756" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "990,3042,103,33", 
      "words": [ 
      { 
       "boundingBox": "990,3042,103,33", 
       "text": "Kassa" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1096,3157,728,40", 
      "words": [ 
      { 
       "boundingBox": "1096,3159,105,38", 
       "text": "Spara" 
      }, 
      { 
       "boundingBox": "1225,3157,163,39", 
       "text": "kvittot," 
      }, 
      { 
       "boundingBox": "1418,3157,127,39", 
       "text": "gäller" 
      }, 
      { 
       "boundingBox": "1570,3169,63,26", 
       "text": "som" 
      }, 
      { 
       "boundingBox": "1657,3158,167,39", 
       "text": "garanti." 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1268,3217,388,39", 
      "words": [ 
      { 
       "boundingBox": "1268,3217,103,39", 
       "text": "Öppet" 
      }, 
      { 
       "boundingBox": "1397,3218,62,38", 
       "text": "köp" 
      }, 
      { 
       "boundingBox": "1484,3218,41,37", 
       "text": "30" 
      }, 
      { 
       "boundingBox": "1550,3218,106,38", 
       "text": "dager" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1290,3276,317,39", 
      "words": [ 
      { 
       "boundingBox": "1290,3276,192,38", 
       "text": "VÄLKOMMEN" 
      }, 
      { 
       "boundingBox": "1506,3278,101,37", 
       "text": "ÅTER!" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1116,3335,719,42", 
      "words": [ 
      { 
       "boundingBox": "1116,3337,41,36", 
       "text": "Om" 
      }, 
      { 
       "boundingBox": "1182,3335,82,38", 
       "text": "ditt" 
      }, 
      { 
       "boundingBox": "1290,3346,84,28", 
       "text": "namn" 
      }, 
      { 
       "boundingBox": "1398,3337,63,38", 
       "text": "och" 
      }, 
      { 
       "boundingBox": "1485,3349,261,28", 
       "text": "personnummer" 
      }, 
      { 
       "boundingBox": "1771,3338,64,37", 
       "text": "har" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "1032,3395,894,42", 
      "words": [ 
      { 
       "boundingBox": "1032,3397,146,36", 
       "text": "lämnats" 
      }, 
      { 
       "boundingBox": "1204,3395,62,38", 
       "text": "för" 
      }, 
      { 
       "boundingBox": "1290,3395,61,38", 
       "text": "att" 
      }, 
      { 
       "boundingBox": "1377,3399,194,36", 
       "text": "genomföra" 
      }, 
      { 
       "boundingBox": "1596,3399,61,36", 
       "text": "ett" 
      }, 
      { 
       "boundingBox": "1685,3399,241,38", 
       "text": "JulaPro-köp" 
      } 
      ] 
     }, 
     { 
      "boundingBox": "985,3455,966,42", 
      "words": [ 
      { 
       "boundingBox": "985,3456,193,37", 
       "text": "behandlar" 
      }, 
      { 
       "boundingBox": "1203,3455,85,37", 
       "text": "Jula" 
      }, 
      { 
       "boundingBox": "1312,3456,84,37", 
       "text": "dina" 
      }, 
      { 
       "boundingBox": "1421,3458,195,39", 
       "text": "uppgifter" 
      }, 
      { 
       "boundingBox": "1645,3462,12,33", 
       "text": "i" 
      }, 
      { 
       "boundingBox": "1686,3458,173,38", 
       "text": "enlighet" 
      }, 
      { 
       "boundingBox": "1886,3461,65,36", 
       "text": "med" 
      } 
      ] 
     } 
     ] 
    } 
    ] 
} 
+0

解決策は見つかりましたか? – RAVI

+0

結局、すべての単語を抽出して1つのリストに入れることにしました。今後、Azure Machine Learningを使用して、どの単語が一緒に属しているのか、どのような種類のプロパティであるのかを判断する予定です。しかし今のところは、個々の単語のバウンディングボックスを使用してクリック可能な領域を作成しているだけで、ユーザーがそれぞれのプロパティに対して適切なボックスを選択する必要があります。 – Lillvik

答えて

2

これはコンピュータビジョンの問題ではなく、NLP/テキストパターン認識の問題です。言い換えれば、どのOCRもあなたが望むことはしてくれません。OCRは画像からテキストを抽出するだけです。

多くの異なる種類の領収書を収集し、その構造を調べ、ルールベースのアプローチまたは機械学習ベースのアプローチを使用して、各情報を分類します。分類器は{ItemName、ItemPrice、Subtotal、Total、Heading、Other}のようなカテゴリを持つことができます。バウンディングボックスを使用してグリッドセルを形成し、隣接するセルを特徴量として使用することができます。これはかなり大がかりな作業であり、高精度の出力を得るには優れたMLスキルが必要です。

See here for tutorial

また、いくつかのオープンソースプロジェクトを見てみましょう:

+0

私たちはAzure Machine Learningと共にMicrosoftのComputer Visionを使用する予定です。これを達成するためのモジュールがAzure ML内にあるかどうかご存知ですか? – Lillvik

関連する問題