2017-07-12 13 views
0

MSWordドキュメントを入力として受け取り、セクション見出し/番号を取得して、そのセクションのテーブルをJSONオブジェクトとして返すPython関数があります。(件名:Python-DocX関数のC#への移植)

このPython関数をC#に変換し、Aspose.Words for .NETライブラリを使用する必要があります。必要であれば、Python関数はPastebinにも置いてあります(下記のリンクを参照)。

from __future__ import (
    absolute_import, division, print_function, unicode_literals 
) 

import json 
from docx import Document 
from docx.document import Document as _Document 
from docx.oxml.text.paragraph import CT_P 
from docx.oxml.text.run import CT_R 
from docx.oxml.table import CT_Tbl 
from docx.table import _Cell, Table 
from docx.text.paragraph import Paragraph 
from docx.text.run import Run 
#import datetime 
import sys, traceback 
import win_unicode_console 
from colorama import init 
from colorama import Fore, Back, Style 


#outputFile = open('Document-ToJSON.csv', 'w', newline='\n') 
#outputWriter = csv.writer(outputFile) 

# Module-level parsing state; all three are rebound to empty containers by
# init_myGlobals().
# Heading texts for the current position, one entry per heading level
# (maintained by add_to_hierarchy).
gblDocTree = [] 
# Running section counters, one integer per heading level
# (maintained by add_to_sectionnumber).
gblDocListNumber = [] 
# Maps a row number to its row list (written by add_RowCol).
gblRowCols = {} 


def init_myGlobals():
    """Enable the console helpers and reset the module-level parsing state.

    Calls ``win_unicode_console.enable()`` and colorama's ``init()``, then
    rebinds ``gblDocTree``, ``gblDocListNumber`` and ``gblRowCols`` to fresh
    empty containers. Returns ``None``.
    """
    global gblDocTree, gblDocListNumber, gblRowCols
    win_unicode_console.enable()
    init()
    gblDocTree, gblDocListNumber, gblRowCols = [], [], {}

class tblparam:
    """Minimal holder that stores one value on the ``param`` attribute."""

    def __init__(self, param):
        # Keep the value exactly as given; no validation or copying.
        self.param = param

def get_num(x):
    """Return the integer spelled by the digit characters of *x*.

    All non-digit characters are skipped and the remaining digits are joined
    in their original order, e.g. ``'Heading 12'`` gives ``12``.

    Raises:
        ValueError: if *x* contains no digit characters, because ``int('')``
            is invalid.
    """
    digits = [ch for ch in x if ch.isdigit()]
    return int(''.join(digits))

def add_RowCol(rowNumber, rowList):
    """Store *rowList* under key *rowNumber* in the module-level ``gblRowCols``.

    An existing entry for the same key is overwritten. Returns ``None``.
    """
    gblRowCols.update({rowNumber: rowList})

def add_to_sectionnumber(myLocation):
    """Advance the section counters in ``gblDocListNumber`` for a new heading.

    *myLocation* is the 1-based heading depth.

    * If the counter list is empty or shallower than *myLocation*, a single
      new counter starting at 1 is appended.
    * Otherwise any counters deeper than *myLocation* are discarded and the
      counter at that depth is incremented.

    Returns ``None``; the list is modified in place.
    """
    counters = gblDocListNumber
    if not counters or myLocation > len(counters):
        # First heading, or a step down into a deeper level.
        counters.append(1)
        return
    # Same depth or back up: the slice delete is a no-op when the list is
    # already exactly myLocation long.
    del counters[myLocation:]
    counters[myLocation - 1] += 1

def add_to_hierarchy(myHeading, myLocation):
    """Record *myHeading* at depth *myLocation* in ``gblDocTree``.

    ``gblDocTree`` holds the chain of heading texts leading to the current
    position (Heading1 > SubHeading2 > SubHeading3). Entries at index
    ``myLocation - 1`` and beyond are dropped, then the new heading is
    appended. A heading deeper than the current chain is simply appended,
    because the slice delete removes nothing in that case.

    Returns ``None``; the list is modified in place.
    """
    del gblDocTree[myLocation - 1:]
    gblDocTree.append(myHeading)

def iter_block_items(parent):
    """Yield each paragraph and table child of *parent* in document order.

    Every yielded value is a ``Paragraph`` or a ``Table``. *parent* is
    normally the main ``Document`` object, but a ``_Cell`` is accepted too,
    since a cell can itself contain paragraphs and tables. Other child
    element types are skipped.

    Raises:
        ValueError: if *parent* is neither a document nor a cell. As this is
            a generator, the error surfaces on first iteration.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")

    # XML element class -> proxy class used to wrap it.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for node in container.iterchildren():
        for xml_type, proxy in wrappers:
            if isinstance(node, xml_type):
                yield proxy(node, parent)
                break

def _normalize_cell(text):
    """Return *text* trimmed, lower-cased and with single quotes doubled."""
    # NOTE(review): the quote doubling looks like SQL-style escaping for a
    # downstream consumer -- confirm before removing.
    return text.strip().lower().replace("'", "''")


def _section_order(current, requested):
    """Compare two dotted section numbers; return -1, 0 or 1.

    Raises ValueError when a component is not an integer.
    """
    current_key = tuple(int(part) for part in current.split('.'))
    requested_key = tuple(int(part) for part in requested.split('.'))
    return (current_key > requested_key) - (current_key < requested_key)


def _table_rows(table):
    """Return the data rows of *table* as dicts keyed by the header row.

    The first row supplies the keys; each later row becomes one dict. A row
    with fewer cells than the header yields only the columns it has.
    """
    rows = iter(table.rows)
    header_row = next(rows, None)
    if header_row is None:
        return []
    headers = [_normalize_cell(cell.text) for cell in header_row.cells]
    records = []
    for row in rows:
        # Read row.cells once per row instead of once per column.
        cells = row.cells
        records.append({
            header: _normalize_cell(cell.text)
            for header, cell in zip(headers, cells)
        })
    return records


def parseDocX(mydocumentfullpath, startSection):
    """Extract the first table of one numbered section of a Word document.

    Headings are recognised by style name and numbered on the fly into
    dotted section numbers ("1", "1.1", ...). The first table found while
    the current section number equals *startSection* is converted to a list
    of row dicts; scanning stops as soon as that table is processed or a
    later section is reached.

    Args:
        mydocumentfullpath: path of the .docx file, passed to ``Document``.
        startSection: dotted section number to extract, e.g. ``"2.3"``.

    Returns:
        A JSON string. ``{"Rows": [...]}`` when a table was found, otherwise
        ``"{}"``. Errors raised while parsing are printed and also result in
        ``"{}"`` rather than being propagated.
    """
    init_myGlobals()

    heading_styles = (
        'Heading 1', 'Heading 2', 'Heading 3', 'Heading 4', 'Heading 5',
        'Heading 6', 'Heading 7', 'Heading 8', 'Heading 9',
        'Egemin1', 'Egemin2', 'Egemin3', 'Egemin4', 'Egemin5', 'Egemin6',
        'Egemin7', 'Egemin8', 'Egemin9', 'Egemin10', 'Egemin11', 'Egemin12',
    )
    sectionJSON = {}
    curListNm = ''  # dotted number of the section currently being scanned

    try:
        document = Document(mydocumentfullpath)
        for block in iter_block_items(document):
            if isinstance(block, Paragraph):
                if block.style.name not in heading_styles:
                    continue
                level = get_num(block.style.name)
                heading_text = block.text.strip()
                add_to_hierarchy(heading_text.lower(), level)
                # Empty headings update the hierarchy but are not numbered.
                if heading_text:
                    add_to_sectionnumber(level)
                curListNm = '.'.join(map(str, gblDocListNumber))
                if curListNm and _section_order(curListNm, startSection) > 0:
                    break  # already past the requested section
            elif isinstance(block, Table):
                if not curListNm:
                    continue  # table before the first numbered heading
                order = _section_order(curListNm, startSection)
                if order < 0:
                    continue
                if order > 0:
                    break
                # Build the row dicts directly. Assembling JSON text by hand
                # and re-parsing it fails on cells containing quotes,
                # backslashes or embedded newlines.
                rows = _table_rows(block)
                for row in rows:
                    # Retained console echo of each parsed row.
                    print(json.dumps(row, ensure_ascii=False,
                                     separators=(',', ':')))
                sectionJSON["Rows"] = rows
                break  # only the first table of the section is wanted
    except IOError as e:
        print('I/O error({0}): {1}'.format(e.errno, e.strerror))
    except ValueError:
        print('Could not convert data to an integer. {0} :: {1}'.format(
            sys.exc_info()[0], sys.exc_info()[1]))
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print('Unexpected error: {0} :: {1}'.format(
            sys.exc_info()[0], sys.exc_info()[1]))
        traceback.print_exc()
    finally:
        win_unicode_console.disable()
    # Return after the finally block: a return inside finally would discard
    # any in-flight exception.
    return json.dumps(sectionJSON, sort_keys=True)

#if __name__=='__main__': 
# sys.exit(main(sys.argv[1], sys.argv[2])) 

リンク:https://pastebin.com/vrHkk9s0

答えて

0

私の知る限り、PythonからC#への変換サービスはありません(コード例は後述します)。

ただし、Pythonは.Netアプリ内から実行できます。

IronPythonを使用すると、実際に.Net CodeをPythonスクリプトに統合することができます。たとえば、Pythonスクリプトの文字列プロパティを設定できます。

また、PythonスクリプトをC#から実行し、Python関数の結果を取得することもできます。

次のようなコードを試してみてください:

// Hosts the Python script through IronPython and calls its parseDocX function.
// The second argument is a placeholder: parseDocX converts each dot-separated
// component of it with int(), so pass a section number such as "1.2".
static void Main() 
{ 
    var ipy = Python.CreateRuntime(); 
    dynamic parser = ipy.UseFile("WordParser.py"); 
    string docPath = "C:\\SomeFolder\\SomeWord.Docx"; 
    // Fixed: the string literal was missing its closing quote.
    var result = parser.parseDocX(docPath, "somesection"); 
} 
+0

こんにちは、返信ありがとうございます。あなたのアイデアは気に入ったので、まずはそれを実装してみます。ただ、私が本当に欲しいのは、私のPythonコードと同等の処理をC#で書いたコードです。 –

関連する問題