3#ifndef PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
4#define PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
9#include <pdfsdk/cxx/math.h>
12#include "wrapper_base.h"
19enum PageTextSearchFlagsEnum {
20 kPageTextSearchIgnoreCase = 1 << 0,
21 kPageTextSearchWholeWord = 1 << 1,
24using PageTextSearchFlags = uint32_t;
42class PageText :
public detail::RefCountedHandle<PDPageText> {
75 std::vector<PageTextRange>
Search(
const std::wstring& text, PageTextSearchFlags flags = 0)
const;
77 PDF_CXX_CORE_WRAPPER_DEFINE_MEMBERS_(
PageText, PDPageText)
149 : m_pagetext(pagetext), m_charindex(charindex) {
159 PDF_CHECK_SUCCESS_X(PDPageTextGetNumChars(m_handle, &numchars));
170 if (quad.Contains(point))
173 return kPDPageIndexNull;
176inline std::vector<PageTextRange>
PageText::Search(
const std::wstring& text, PageTextSearchFlags flags)
const {
179 std::vector<PageTextRange> results;
180 std::wstring pattern = text;
181 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](
wchar_t ch) { return std::iswspace(ch) ? L
'\0' : ch; });
182 if ((flags & kPageTextSearchIgnoreCase) != 0)
183 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](
wchar_t ch) { return std::towlower(ch); });
186 if (0 == numchars || numchars < pattern.size())
189 size_t patternsize = pattern.size();
191 std::vector<int> kmptable(patternsize);
193 if (patternsize > 1) {
195 if (patternsize > 2) {
198 while (pos < patternsize) {
199 if (pattern[pos - 1] == pattern[cnd])
200 kmptable[pos++] = ++cnd;
211 size_t currentindex = 0;
212 size_t matchindex = currentindex;
214 while (currentindex < numchars) {
216 if ((flags & kPageTextSearchIgnoreCase) != 0)
217 std::transform(u.begin(), u.end(), u.begin(), [&](
wchar_t ch) { return std::towlower(ch); });
219 if (std::iswspace(u[0]))
222 size_t checksize = (std::min)(patternsize - kmpindex, u.size());
224 if (!std::memcmp(&pattern[kmpindex], u.data(), checksize *
sizeof(
wchar_t))) {
225 kmpindex +=
static_cast<int>(checksize);
226 if (kmpindex ==
static_cast<int>(patternsize)) {
229 size_t resbegindex = matchindex;
230 size_t resendindex = currentindex;
233 currentindex = ++matchindex;
236 if ((flags & kPageTextSearchWholeWord) != 0) {
237 std::wstring prefix = resbegindex > 0 ?
GetChar(resbegindex - 1).
GetUnicode() : std::wstring{};
238 std::wstring suffix = resendindex < numchars ?
GetChar(resendindex).
GetUnicode() : std::wstring{};
239 auto isDelimiter = [](
wchar_t ch) {
240 return ch ==
'\0' || std::iswspace(ch) || std::iswpunct(ch);
242 if (!(prefix.empty() || isDelimiter(prefix.front())) ||
243 !(suffix.empty() || isDelimiter(suffix.front())))
248 results.push_back({resbegindex, resendindex});
254 if (kmptable[kmpindex] >= 0) {
255 matchindex += kmpindex - kmptable[kmpindex];
256 kmpindex = kmptable[kmpindex];
262 currentindex = matchindex + kmpindex;
271 PDF_CHECK_SUCCESS_X(PDPageTextGetCharQuad(m_pagetext.get(), m_charindex, &quad));
276 return detail::GetWstringProperty(PDPageTextGetCharUnicode, m_pagetext.get(), m_charindex);
281 PDF_CHECK_SUCCESS_X(PDPageTextGetCharRotate(m_pagetext.get(), m_charindex, &rotate));
287 PDF_CHECK_SUCCESS_X(PDPageTextGetFontInfo(m_pagetext.get(), m_charindex, &fontinfo));
293 PDF_CHECK_SUCCESS_X(PDPageTextGetFontSize(m_pagetext.get(), m_charindex, &fontsize));
298 PDColorValue fontcolor = 0;
299 PDF_CHECK_SUCCESS_X(PDPageTextGetFontColor(m_pagetext.get(), m_charindex, &fontcolor));
308 using iterator_category = std::input_iterator_tag;
310 using difference_type = int32_t;
505using PageTextReverseIterator = std::reverse_iterator<PageTextIterator>;
508 : m_pagetext(nullptr), m_charindex(0), m_numchars(0) {
512 : m_pagetext(pagetext.get()), m_charindex(charindex) {
515 PDF_CHECK(charindex <= m_numchars,
kPDErrOutOfRange,
"Char index is out of range");
519 : m_pagetext(rhs.m_pagetext), m_charindex(rhs.m_charindex), m_numchars(rhs.m_numchars) {
526 PDF_CHECK(m_charindex + offset <= m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
527 m_charindex += offset;
540 m_pagetext = rhs.m_pagetext;
541 m_charindex = rhs.m_charindex;
542 m_numchars = rhs.m_numchars;
547 PDF_CHECK(m_charindex < m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
553 PDF_CHECK(m_charindex < m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
559 PDF_CHECK(m_charindex > 0,
kPDErrBadParam,
"Iterator is not decrementable");
565 PDF_CHECK(m_charindex > 0,
kPDErrBadParam,
"Iterator is not decrementable");
574 PDF_CHECK(m_charindex + offset <= m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
576 m_charindex += offset;
593 PDF_CHECK(m_charindex - offset >= 0,
kPDErrBadParam,
"Iterator is not decrementable");
594 m_charindex -= offset;
608 return m_pagetext == rhs.m_pagetext && m_charindex == rhs.m_charindex;
620 PDF_CHECK(m_pagetext == rhs.m_pagetext,
kPDErrBadParam,
"Iterators are not comparable - different pages");
621 return m_charindex < rhs.m_charindex;
657 return m_pagetext.
GetChar(m_charindex);
660inline PageTextChar PageTextIterator::operator*()
const {
662 return m_pagetext.
GetChar(m_charindex);
Represents a character in a PageText.
Definition page_text.h:83
PDFontInfo GetFontInfo() const
Get the font information of a character. This includes the font family, style and format.
Definition page_text.h:285
std::wstring GetUnicode() const
Get the Unicode representation of a character.
Definition page_text.h:275
PDColorValue GetFontColor() const
Get the font color of a character.
Definition page_text.h:297
float GetRotate() const
Get the rotation angle of a character within a text element.
Definition page_text.h:279
float GetFontSize() const
Get the font size of a character.
Definition page_text.h:291
PDF::Quad GetQuad() const
Get the quad (bounding box) of a character.
Definition page_text.h:269
Represents text on a page.
Definition page_text.h:42
std::vector< PageTextRange > Search(const std::wstring &text, PageTextSearchFlags flags=0) const
Search for the specified text throughout all text within a page.
Definition page_text.h:176
PageTextChar GetChar(size_t charindex) const
Get the character at the specified index.
Definition page_text.h:163
size_t HitTest(const PDF::PointF &point) const
Perform a hit test at the specified point and return the index of the character that contains the point.
Definition page_text.h:167
size_t GetNumChars() const
Get the number of characters from a page, text segment or a word.
Definition page_text.h:157
An iterator for iterating over the characters in a PageText object.
Definition page_text.h:306
bool operator>(const PageTextIterator &rhs) const
Greater than operator.
Definition page_text.h:632
PageText GetPageText() const
Get the PageText object.
Definition page_text.h:648
PageTextIterator operator-(difference_type offset) const
Subtraction operator.
Definition page_text.h:598
PageTextIterator & operator=(const PageTextIterator &rhs)
Assignment operator.
Definition page_text.h:539
bool operator<=(const PageTextIterator &rhs) const
Less than or equal to operator.
Definition page_text.h:628
bool operator!=(const PageTextIterator &rhs) const
Inequality operator.
Definition page_text.h:615
PageTextIterator & operator-=(difference_type offset)
Subtraction assignment operator.
Definition page_text.h:589
bool operator==(const PageTextIterator &rhs) const
Equality operator.
Definition page_text.h:611
PageTextIterator & Prev()
Move the iterator to the previous character.
Definition page_text.h:535
PageTextIterator & Next()
Move the iterator to the next character.
Definition page_text.h:531
PageTextIterator()
Default constructor.
Definition page_text.h:507
bool operator<(const PageTextIterator &rhs) const
Less than operator.
Definition page_text.h:624
PageTextIterator & operator--()
Pre-decrement operator.
Definition page_text.h:558
PageTextIterator operator+(difference_type offset) const
Addition operator.
Definition page_text.h:580
bool Equals(const PageTextIterator &rhs) const
Equality comparison.
Definition page_text.h:607
PageTextIterator & Advance(int offset)
Advance the iterator by the specified offset.
Definition page_text.h:522
PageTextIterator end() const
Get the ending iterator.
Definition page_text.h:644
size_t GetCharIndex() const
Get the character index.
Definition page_text.h:652
bool operator>=(const PageTextIterator &rhs) const
Greater than or equal to operator.
Definition page_text.h:636
PageTextChar GetChar() const
Get the character at the current iterator position.
Definition page_text.h:656
PageTextIterator & operator++()
Pre-increment operator.
Definition page_text.h:546
PageTextIterator begin() const
Get the beginning iterator.
Definition page_text.h:640
PageTextIterator & operator+=(difference_type offset)
Addition assignment operator.
Definition page_text.h:570
bool LessThen(const PageTextIterator &rhs) const
Less than comparison.
Definition page_text.h:619
@ kPDErrBadParam
Bad input parameter.
Definition errors.h:20
@ kPDErrOutOfRange
Value out of range.
Definition errors.h:22
Represents a range of text on a page.
Definition page_text.h:29
size_t endindex
Definition page_text.h:31
size_t begindex
Definition page_text.h:30