ありがとうございました。
以下が(処理は重いものの、動作する)回答です。
その後
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using iTextSharp.text.pdf;
// PdfTextExtractor
using iTextSharp.text.pdf.parser;
namespace PdfParsingiTextSharp {
マーカーを収集するためのサンプルコード...
/// <summary>
/// Marker element used to build a sortable collection of markers.
/// A marker is either a signet (presumably a bookmark/outline entry - confirm)
/// or an annotation; the page, text and area it refers to are stored in public fields.
/// </summary>
public class cMark : IComparable {
    /// <summary>Kind of marker.</summary>
    public enum TypeMarker {
        TypeSignet,
        TypeAnnotation
    };

    /// <summary>Text-markup flavour of an annotation marker.</summary>
    public enum TypeAnnotationSubType {
        TypeAnnotation_NONE,
        TypeAnnotation_UNDERLINE,
        TypeAnnotation_HIGHLIGHT,
        TypeAnnotation_STRIKEOUT,
        TypeAnnotation_SQUIGGLY
    };

    public TypeMarker eType;
    public TypeAnnotationSubType eAnnotationSubType;
    // level of signet (-1 when not set)
    public int signetLevel;
    // page in document (-1 when not set)
    public int pageNum;
    // indirect reference of page (-1 when not set)
    public int pageRef;
    // text of signet or annotation
    public String title;
    // area rectangle of annotation (null when not set)
    public iTextSharp.text.Rectangle annotRect;

    /// <summary>
    /// Creates a marker of the given kind with every other field set to its
    /// "not set" value (-1, empty string or null).
    /// </summary>
    /// <param name="p_eType">Kind of marker.</param>
    /// <param name="p_TypeAnnotationSubType">Annotation sub type.</param>
    public cMark(TypeMarker p_eType, TypeAnnotationSubType p_TypeAnnotationSubType) {
        eType = p_eType;
        eAnnotationSubType = p_TypeAnnotationSubType;
        signetLevel = -1;
        pageNum = -1;
        pageRef = -1;
        title = "";
        annotRect = null;
    }

    /// <summary>
    /// Compares first on page, then on row (rectangle top), and finally on
    /// column (rectangle left).
    /// </summary>
    /// <remarks>
    /// Each key is compared as "other versus this", so a default ascending sort
    /// places higher page numbers, higher Top values and higher Left values first.
    /// NOTE(review): confirm that descending page order is what callers expect.
    /// </remarks>
    /// <param name="obj">The marker to compare with; must be a <see cref="cMark"/> or null.</param>
    /// <returns>
    /// A negative value, zero or a positive value following the ordering described above.
    /// Zero is returned for markers on the same page when either rectangle is missing.
    /// </returns>
    /// <exception cref="InvalidCastException"><paramref name="obj"/> is not a <see cref="cMark"/>.</exception>
    public int CompareTo(object obj) {
        // IComparable convention: every instance is greater than null.
        if (obj == null) {
            return 1;
        }

        cMark other = (cMark)obj;

        int pageTest = other.pageNum.CompareTo(this.pageNum);
        if (pageTest != 0) {
            return pageTest;
        }

        // Without both rectangles there is no position to compare on.
        if (this.annotRect == null || other.annotRect == null) {
            return 0;
        }

        int rowTest = other.annotRect.Top.CompareTo(this.annotRect.Top);
        if (rowTest != 0) {
            return rowTest;
        }

        return other.annotRect.Left.CompareTo(this.annotRect.Left);
    }
}
続いて、注釈の解析です。
// Parsing of the annotations in a document.
public static class Demo {
    /// <summary>
    /// Parses the annotations of a PDF file and fills <paramref name="markers"/>
    /// with one marker per supported annotation.
    /// </summary>
    /// <remarks>
    /// Text and FreeText annotations that have /Contents produce a marker whose title
    /// is the contents string and whose rectangle is /Rect (when present).
    /// Underline, Highlight, StrikeOut and Squiggly annotations that have /QuadPoints
    /// produce a marker whose title is the page text found inside the bounding box of
    /// the first quadrilateral.
    /// NOTE(review): only the first 8 numbers of /QuadPoints are used, so a markup
    /// spanning several quadrilaterals is reduced to its first one.
    /// </remarks>
    /// <param name="reader">Reader opened on the document to inspect.</param>
    /// <param name="markers">List that is cleared and then receives the markers.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="reader"/> or <paramref name="markers"/> is null.
    /// </exception>
    static void parseAnnotations(PdfReader reader, List<cMark> markers) {
        if (reader == null) {
            throw new ArgumentNullException("reader");
        }
        if (markers == null) {
            throw new ArgumentNullException("markers");
        }

        markers.Clear();

        // on each page
        for (int pg = 1; pg <= reader.NumberOfPages; pg++) {
            PdfDictionary pagedic = reader.GetPageN(pg);
            // get annotations array
            PdfArray annotarray = PdfReader.GetPdfObject(pagedic.Get(PdfName.ANNOTS)) as PdfArray;
            // if no annotation ...
            if (annotarray == null || annotarray.Size == 0) {
                continue;
            }

            // Entries may be indirect references or direct dictionaries;
            // PdfReader.GetPdfObject is used to obtain the dictionary in both cases.
            foreach (PdfObject annot in annotarray.ArrayList) {
                PdfDictionary annotationDic = PdfReader.GetPdfObject(annot) as PdfDictionary;
                if (annotationDic == null) {
                    continue;
                }

                PdfName subType = PdfReader.GetPdfObject(annotationDic.Get(PdfName.SUBTYPE)) as PdfName;
                if (subType == null) {
                    // Nothing can be classified without a sub type.
                    continue;
                }

                // if simple text...
                PdfString contents = annotationDic.GetAsString(PdfName.CONTENTS);
                if (contents != null &&
                    (subType.Equals(PdfName.TEXT) || subType.Equals(PdfName.FREETEXT))) {
                    cMark textMark = new cMark(cMark.TypeMarker.TypeAnnotation, cMark.TypeAnnotationSubType.TypeAnnotation_NONE);
                    textMark.pageNum = pg;
                    textMark.title = contents.ToString();
                    textMark.annotRect = readRect(annotationDic);
                    markers.Add(textMark);
                }

                // if decorated text...
                cMark.TypeAnnotationSubType decoration;
                if (!tryMapDecoration(subType, out decoration)) {
                    continue;
                }

                PdfArray quad = annotationDic.GetAsArray(PdfName.QUADPOINTS);
                if (quad == null || quad.Size < 8) {
                    // No usable quadrilateral: there is no region to read text from.
                    continue;
                }

                PdfRectangle textRect = firstQuadBounds(quad);
                RenderFilter[] filter = { new RegionTextRenderFilter(textRect.Rectangle) };
                ITextExtractionStrategy strategy =
                    new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

                cMark mrk = new cMark(cMark.TypeMarker.TypeAnnotation, decoration);
                mrk.pageNum = pg;
                // The quad points belong to the annotation's own page, so only that page
                // is read. The trailing line terminator keeps the title format that was
                // previously produced for a single-page document.
                mrk.title = PdfTextExtractor.GetTextFromPage(reader, pg, strategy) + Environment.NewLine;
                mrk.annotRect = textRect.Rectangle;
                markers.Add(mrk);
            }
        }
    }

    // Maps a text-markup sub type name to its marker sub type; returns false for any other name.
    private static bool tryMapDecoration(PdfName subType, out cMark.TypeAnnotationSubType decoration) {
        if (subType.Equals(PdfName.UNDERLINE)) {
            decoration = cMark.TypeAnnotationSubType.TypeAnnotation_UNDERLINE;
            return true;
        }
        if (subType.Equals(PdfName.HIGHLIGHT)) {
            decoration = cMark.TypeAnnotationSubType.TypeAnnotation_HIGHLIGHT;
            return true;
        }
        if (subType.Equals(PdfName.STRIKEOUT)) {
            decoration = cMark.TypeAnnotationSubType.TypeAnnotation_STRIKEOUT;
            return true;
        }
        if (subType.Equals(PdfName.SQUIGGLY)) {
            decoration = cMark.TypeAnnotationSubType.TypeAnnotation_SQUIGGLY;
            return true;
        }
        decoration = cMark.TypeAnnotationSubType.TypeAnnotation_NONE;
        return false;
    }

    // Reads /Rect of an annotation as a rectangle; returns null when it is absent or has fewer than 4 entries.
    private static iTextSharp.text.Rectangle readRect(PdfDictionary annotationDic) {
        PdfArray coord = annotationDic.GetAsArray(PdfName.RECT);
        if (coord == null || coord.Size < 4) {
            return null;
        }
        PdfRectangle rect = new PdfRectangle(
            numberAt(coord, 0),
            numberAt(coord, 1),
            numberAt(coord, 2),
            numberAt(coord, 3));
        return rect.Rectangle;
    }

    // Computes the bounding box (llx, lly, urx, ury) of the first four x/y pairs of a /QuadPoints array.
    private static PdfRectangle firstQuadBounds(PdfArray quad) {
        float lowX = numberAt(quad, 0);
        float lowY = numberAt(quad, 1);
        float upX = lowX;
        float upY = lowY;
        for (int corner = 1; corner < 4; corner++) {
            float x = numberAt(quad, 2 * corner);
            float y = numberAt(quad, 2 * corner + 1);
            lowX = Math.Min(lowX, x);
            lowY = Math.Min(lowY, y);
            upX = Math.Max(upX, x);
            upY = Math.Max(upY, y);
        }
        return new PdfRectangle(lowX, lowY, upX, upY);
    }

    // Returns the array element at the given index as a float; throws InvalidCastException when it is not a number.
    private static float numberAt(PdfArray array, int index) {
        return ((PdfNumber)PdfReader.GetPdfObject(array[index])).FloatValue;
    }
}
}
PDFを投稿してください。 –
質問が不明確です。注釈辞書のキーは列挙されていますが、その値が示されていません。「対応する純粋なテキスト」とは、どういう意味でしょうか?注釈の '/Contents' を探しているのですか?それとも、'/QuadPoints' によって定義された領域にある実際のページコンテンツ(ページストリーム内)を探しているのですか?現状のままでは、この質問には回答できません。「不明確」としてクローズされるべきです。 –