2016-08-26 24 views
0

こんにちは。Java の StAX パーサーを使って XML ファイルを解析したいと考えています。

複数の開始タグを持つ大きな XML ファイルがあります。これを解析し、会話 ID ごとにグループ化して、各ユーザーとその発言内容(Content)を表示したいので、助けてください。例えば、1 つの会話 ID の内容をファイルの 1 行目に書き出し、別の会話 ID であれば 2 行目に書き出します。

出力ファイルは次のようになるべきです: 1 converID userName 会話 + userName 会話 .... 2 anotherConvID userName 会話 + userName 会話 ....

 <?xml version="1.0" encoding="UTF-8" standalone="no"?> 
<!-- Data provided by Bloomberg LP. --> 
<FileDump> 
<Version>IBXML 1.3</Version> 
<Conversation Perspective=" " RoomType="P"> 
<RoomID>PCHAT-0x3000001CA8361</RoomID> 
<StartTime>03/31/2016 13:39:01</StartTime> 
<StartTimeUTC>1459431541</StartTimeUTC> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 13:39:01</DateTime> 
<DateTimeUTC>1459431541</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 14:56:22</DateTime> 
<DateTimeUTC>1459436182</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantLeft InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 19:30:01</DateTime> 
<DateTimeUTC>1459452601</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantLeft> 
<ParticipantLeft InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 19:33:56</DateTime> 
<DateTimeUTC>1459452836</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantLeft> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 19:45:16</DateTime> 
<DateTimeUTC>1459453516</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantLeft InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 23:08:09</DateTime> 
<DateTimeUTC>1459465689</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantLeft> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 23:14:23</DateTime> 
<DateTimeUTC>1459466063</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<Message InteractionType="N"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:10:57</DateTime> 
<DateTimeUTC>1459469457</DateTimeUTC> 
<Content> 
abcdefgghhhhhh 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>WVU</LoginName> 
<FirstName>WHEELOCK</FirstName> 
<LastName>VU</LastName> 
<UUID>8266852</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:14:05</DateTime> 
<DateTimeUTC>1459469645</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantEntered InteractionType="N"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:29:19</DateTime> 
<DateTimeUTC>1459470559</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<Message InteractionType="N"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:29:19</DateTime> 
<DateTimeUTC>1459470559</DateTimeUTC> 
<Content> 
ajdakjgdljsgdsafhkafa 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:29:19</DateTime> 
<DateTimeUTC>1459470559</DateTimeUTC> 
<Content> 
akjdgljsafdlshf;kdsjf 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N"> 
<User> 
<LoginName>WVU</LoginName> 
<FirstName>WHEELOCK</FirstName> 
<LastName>VU</LastName> 
<UUID>8266852</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:39:32</DateTime> 
<DateTimeUTC>1459471172</DateTimeUTC> 
<Content> 
sagdksajdlsahd 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 01:01:27</DateTime> 
<DateTimeUTC>1459472487</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<Message InteractionType="N"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 01:31:29</DateTime> 
<DateTimeUTC>1459474289</DateTimeUTC> 
<Content> 
ajdslsahdsj;a 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 02:49:46</DateTime> 
<DateTimeUTC>1459478986</DateTimeUTC> 
<Content> 
sagdkjsagdkjashdlasjd 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 02:49:46</DateTime> 
<DateTimeUTC>1459478986</DateTimeUTC> 
<Content> 
jsdhkshdksjdlsjdlks 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 03:47:37</DateTime> 
<DateTimeUTC>1459482457</DateTimeUTC> 
<Content> 
jshdkshdksjdlskld 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 03:47:37</DateTime> 
<DateTimeUTC>1459482457</DateTimeUTC> 
<Content> 
aasasasasas 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<EndTime>04/01/2016 03:47:37</EndTime> 
<EndTimeUTC>1459482457</EndTimeUTC> 
</Conversation> 
</FileDump> 
+0

はそれがあるので、plsは –

答えて

1

すべての "Content"、"LoginName"、"ConversationID" ノードのテキストがメモリに収まる場合、解決策は以下のようになります(input.xml から読み取り、output.txt に書き込みます。また、質問に書かれているとおり、各行に 1, 2, ... の行番号を付け、異なるメッセージのデータを "+" 記号で区切るものと仮定しています)。

しかし、データがメモリに収まらない場合は、例えば StAX を使って「ConversationID、LoginName、Content」の形式でファイルに書き出し、次にそのファイルを外部メモリ(ディスク)上でソートして、同じ ConversationID を持つ連続した行をマージします。あるいは、元の XML を複数に分割し、それぞれを次のコードで処理する方法もあります。その場合も結果のファイルをマージする必要がありますが、こちらのほうが簡単かもしれません。

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.commons.lang3.StringUtils;

/**
 * Flattens a chat dump ({@code input.xml}) into a text file ({@code output.txt})
 * with one numbered line per {@code <Conversation>} element:
 *
 * <pre>
 * N startTime roomId (distinctPosters) user content + user content + ...
 * </pre>
 *
 * The document is read with a streaming StAX parser; only the parsed
 * conversations (not the XML itself) are held in memory.
 */
public class Solution {

    private static final String ROOM_ID = "RoomID";
    private static final String CONTENT = "Content";
    private static final String LOGIN_NAME = "LoginName";
    private static final String CONVERSATION_ID = "ConversationID";
    private static final String FILE_DUMP = "FileDump";
    private static final String MESSAGE = "Message";
    private static final String CONVERSATION = "Conversation";
    private static final String START_TIME = "StartTime";

    /** Separator placed between two messages of the same conversation. */
    private static final String MESSAGE_SEPARATOR = " + ";

    /** Everything collected from a single {@code <Conversation>} element. */
    static class ConversationInfo {
        /** Raw text of the conversation's {@code <StartTime>} element. */
        private String startTimeStr;

        /** Raw text of the conversation's {@code <RoomID>} element. */
        private String conversationId;

        /** Login names of the users that posted at least one message. */
        private final Set<String> users = new HashSet<>();

        /** Messages in document order. */
        private final List<Message> messages = new ArrayList<>();

        /**
         * @return {@code "startTime roomId (userCount) msg + msg + ..."};
         *         the message part is empty when there are no messages
         */
        @Override
        public String toString() {
            StringBuilder joined = new StringBuilder();
            String separator = "";
            for (Message message : messages) {
                joined.append(separator).append(message);
                separator = MESSAGE_SEPARATOR;
            }
            return String.format("%s %s (%d) %s", startTimeStr, conversationId,
                    users.size(), joined);
        }
    }

    /** A single chat message: who wrote it and what was written. */
    static class Message {

        public final String userName;

        public final String content;

        /**
         * @param name    login name of the author ({@code null} if the message had none)
         * @param content trimmed message text ({@code null} if the message had none)
         */
        public Message(String name, String content) {
            this.userName = name;
            this.content = content;
        }

        /** @return {@code "userName content"} */
        @Override
        public String toString() {
            return userName + " " + content;
        }
    }

    /**
     * Reads {@code input.xml} from the working directory and writes the
     * flattened conversations to {@code output.txt}.
     *
     * @param args ignored
     * @throws XMLStreamException    if the input is not well-formed XML
     * @throws IOException           if a file cannot be read or written
     * @throws IllegalStateException if the document ends before
     *                               {@code </FileDump>} is seen
     */
    public static void main(String[] args)
            throws XMLStreamException, IOException {
        XMLInputFactory xf = XMLInputFactory.newFactory();
        // The dump comes from outside the program: refuse DTDs and external
        // entities so a crafted file cannot read local files (XXE) or blow up
        // memory through entity expansion.
        xf.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        xf.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        List<ConversationInfo> conversations;
        try (FileInputStream fin = new FileInputStream("input.xml")) {
            XMLStreamReader xr = xf.createXMLStreamReader(fin);
            try {
                conversations = readConversations(xr);
            } finally {
                // XMLStreamReader is not AutoCloseable and close() does not
                // close the underlying stream, hence the explicit finally
                // inside the try-with-resources.
                xr.close();
            }
        }

        // Write with an explicit charset: FileWriter would use the platform
        // default and could corrupt non-ASCII chat text.
        // NOTE(review): UTF-8 is chosen to match the encoding the sample input declares.
        try (OutputStreamWriter w = new OutputStreamWriter(
                new FileOutputStream("output.txt"), StandardCharsets.UTF_8)) {
            int lineNo = 1;
            for (ConversationInfo convInfo : conversations) {
                w.write(String.format("%d %s\n", lineNo++, convInfo));
            }
        }
    }

    /**
     * Collects every {@code <Conversation>} found until {@code </FileDump>}.
     */
    private static List<ConversationInfo> readConversations(XMLStreamReader xr)
            throws XMLStreamException {
        List<ConversationInfo> conversations = new ArrayList<>();
        while (xr.hasNext()) {
            switch (xr.next()) {
                case XMLStreamConstants.START_ELEMENT:
                    if (CONVERSATION.equals(xr.getLocalName())) {
                        conversations.add(parseConversation(xr));
                    }
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    if (FILE_DUMP.equals(xr.getLocalName())) {
                        return conversations;
                    }
                    break;
                case XMLStreamConstants.END_DOCUMENT:
                    throw new IllegalStateException("xml not well-formed: <"
                            + FILE_DUMP + "> tag not closed");
                default:
                    break;
            }
        }
        return conversations;
    }

    /**
     * Parses one conversation; the reader must be positioned on the
     * {@code <Conversation>} start tag and is left on its end tag.
     */
    private static ConversationInfo parseConversation(XMLStreamReader xr)
            throws XMLStreamException {
        ConversationInfo convInfo = new ConversationInfo();
        while (xr.hasNext()) {
            switch (xr.next()) {
                case XMLStreamConstants.START_ELEMENT: {
                    String elName = xr.getLocalName();
                    if (MESSAGE.equals(elName)) {
                        Message message = parseMessage(xr);
                        convInfo.messages.add(message);
                        convInfo.users.add(message.userName);
                    } else if (START_TIME.equals(elName)) {
                        convInfo.startTimeStr = xr.getElementText();
                    } else if (ROOM_ID.equals(elName)) {
                        convInfo.conversationId = xr.getElementText();
                    }
                    break;
                }
                case XMLStreamConstants.END_ELEMENT:
                    if (CONVERSATION.equals(xr.getLocalName())) {
                        return convInfo;
                    }
                    break;
                case XMLStreamConstants.END_DOCUMENT:
                    throw new XMLStreamException("xml not well-formed: <"
                            + CONVERSATION + "> tag not closed");
                default:
                    break;
            }
        }
        throw new XMLStreamException(
                "unexpected end of xml file while parsing a conversation");
    }

    /**
     * Parses one message; the reader must be positioned on the
     * {@code <Message>} start tag and is left on its end tag.
     */
    private static Message parseMessage(XMLStreamReader xr)
            throws XMLStreamException {
        String userName = null;
        String content = null;
        while (xr.hasNext()) {
            switch (xr.next()) {
                case XMLStreamConstants.START_ELEMENT: {
                    String elName = xr.getLocalName();
                    if (LOGIN_NAME.equals(elName)) {
                        userName = xr.getElementText();
                    } else if (CONTENT.equals(elName)) {
                        // getElementText() never returns null, so a plain
                        // trim() strips the line breaks around the text.
                        content = xr.getElementText().trim();
                    }
                    break;
                }
                case XMLStreamConstants.END_ELEMENT:
                    if (MESSAGE.equals(xr.getLocalName())) {
                        return new Message(userName, content);
                    }
                    break;
                case XMLStreamConstants.END_DOCUMENT:
                    throw new XMLStreamException("xml not well-formed: <"
                            + MESSAGE + "> tag not closed");
                default:
                    break;
            }
        }
        throw new XMLStreamException(
                "unexpected end of xml file while parsing a message");
    }
}
+0

Multimap は Guava ライブラリのクラスです。現時点の最新バージョンは 19 です。 – starikoff

+0

いずれにせよ、Multimap も Guava もまったく使わないようにソリューションを更新し、入力ファイルの変更に合わせて必要な修正もいくつか加えました。 – starikoff

+0

この XML のデータの関係について質問があります。 1) 会話(<Conversation> ... </Conversation>)内のすべてのメッセージ(その内部にある <Message> ... </Message>)は、その会話にのみ属していると仮定できますか? 2) 会話内にネストされたメッセージの ConversationID は、その会話の RoomID と等しいと仮定できますか? 3) ファイル内に、重複した RoomID を持つ会話は存在しないと仮定できますか? – starikoff