PDF SDK Documentation

Comprehensive Guide for Developers: Features, Integration, and API Reference

Loading...
Searching...
No Matches
page_text.h
1// Copyright (c) 2009-2025 Avanquest Software. All rights reserved.
2
3#ifndef PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
4#define PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
5
6#include <cstring>
7#include <cwctype>
8
9#include <pdfsdk/cxx/math.h>
10
11#include "helpers.h"
12#include "wrapper_base.h"
13
14namespace PDF {
15
/// Bit flags that modify how PageText::Search matches text.
/// Values are distinct bits so they can be combined with bitwise OR.
enum PageTextSearchFlagsEnum {
  /// Lower-case both the pattern and the page characters before comparing.
  kPageTextSearchIgnoreCase = 1 << 0,
  /// Keep a hit only when the characters on either side of it are absent,
  /// whitespace or punctuation.
  kPageTextSearchWholeWord = 1 << 1,
};

/// Storage type for a combination of PageTextSearchFlagsEnum bits.
using PageTextSearchFlags = uint32_t;
25
30 size_t begindex;
31 size_t endindex;
32};
33
34class PageTextChar;
35
42class PageText : public detail::RefCountedHandle<PDPageText> {
43public:
48 size_t GetNumChars() const;
49
55 PageTextChar GetChar(size_t charindex) const;
56
66 size_t HitTest(const PDF::PointF& point) const;
67
75 std::vector<PageTextRange> Search(const std::wstring& text, PageTextSearchFlags flags = 0) const;
76
77 PDF_CXX_CORE_WRAPPER_DEFINE_MEMBERS_(PageText, PDPageText)
78};
79
84public:
94 PDF::Quad GetQuad() const;
95
105 std::wstring GetUnicode() const;
106
117 float GetRotate() const;
118
129 PDFontInfo GetFontInfo() const;
130
136 float GetFontSize() const;
137
143 PDColorValue GetFontColor() const;
144
145private:
146 friend class PageText;
147
148 PageTextChar(const PageText& pagetext, size_t charindex)
149 : m_pagetext(pagetext), m_charindex(charindex) {
150 }
151
152private:
153 PageText m_pagetext;
154 size_t m_charindex;
155};
156
157inline size_t PageText::GetNumChars() const {
158 size_t numchars = 0;
159 PDF_CHECK_SUCCESS_X(PDPageTextGetNumChars(m_handle, &numchars));
160 return numchars;
161}
162
163inline PageTextChar PageText::GetChar(size_t charindex) const {
164 return PageTextChar(*this, charindex);
165}
166
167inline size_t PageText::HitTest(const PDF::PointF& point) const {
168 for (size_t i = 0; i < GetNumChars(); ++i) {
169 auto quad = GetChar(i).GetQuad();
170 if (quad.Contains(point))
171 return i;
172 }
173 return kPDPageIndexNull;
174}
175
176inline std::vector<PageTextRange> PageText::Search(const std::wstring& text, PageTextSearchFlags flags) const {
177 // Implements a variation of Knuth-Morris-Pratt algorithm.
178
179 std::vector<PageTextRange> results;
180 std::wstring pattern = text;
181 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](wchar_t ch) { return std::iswspace(ch) ? L'\0' : ch; });
182 if ((flags & kPageTextSearchIgnoreCase) != 0)
183 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](wchar_t ch) { return std::towlower(ch); });
184
185 size_t numchars = GetNumChars();
186 if (0 == numchars || numchars < pattern.size())
187 return results;
188
189 size_t patternsize = pattern.size();
190
191 std::vector<int> kmptable(patternsize);
192 kmptable[0] = -1;
193 if (patternsize > 1) {
194 kmptable[1] = 0;
195 if (patternsize > 2) {
196 size_t pos = 2;
197 int cnd = 0;
198 while (pos < patternsize) {
199 if (pattern[pos - 1] == pattern[cnd])
200 kmptable[pos++] = ++cnd;
201 else if (cnd > 0)
202 cnd = kmptable[cnd];
203 else
204 kmptable[pos++] = 0;
205 }
206 }
207 }
208
209 int kmpindex = 0;
210
211 size_t currentindex = 0;
212 size_t matchindex = currentindex;
213
214 while (currentindex < numchars) {
215 std::wstring u = GetChar(currentindex).GetUnicode();
216 if ((flags & kPageTextSearchIgnoreCase) != 0)
217 std::transform(u.begin(), u.end(), u.begin(), [&](wchar_t ch) { return std::towlower(ch); });
218
219 if (std::iswspace(u[0]))
220 u[0] = L'\0';
221
222 size_t checksize = (std::min)(patternsize - kmpindex, u.size());
223
224 if (!std::memcmp(&pattern[kmpindex], u.data(), checksize * sizeof(wchar_t))) {
225 kmpindex += static_cast<int>(checksize);
226 if (kmpindex == static_cast<int>(patternsize)) {
227 ++currentindex;
228
229 size_t resbegindex = matchindex;
230 size_t resendindex = currentindex;
231
232 kmpindex = 0;
233 currentindex = ++matchindex;
234
235 bool match = true;
236 if ((flags & kPageTextSearchWholeWord) != 0) {
237 std::wstring prefix = resbegindex > 0 ? GetChar(resbegindex - 1).GetUnicode() : std::wstring{};
238 std::wstring suffix = resendindex < numchars ? GetChar(resendindex).GetUnicode() : std::wstring{};
239 auto isDelimiter = [](wchar_t ch) {
240 return ch == '\0' || std::iswspace(ch) || std::iswpunct(ch);
241 };
242 if (!(prefix.empty() || isDelimiter(prefix.front())) ||
243 !(suffix.empty() || isDelimiter(suffix.front())))
244 match = false;
245 }
246
247 if (match)
248 results.push_back({resbegindex, resendindex});
249
250 } else {
251 ++currentindex;
252 }
253 } else {
254 if (kmptable[kmpindex] >= 0) {
255 matchindex += kmpindex - kmptable[kmpindex];
256 kmpindex = kmptable[kmpindex];
257 } else {
258 ++matchindex;
259 kmpindex = 0;
260 }
261
262 currentindex = matchindex + kmpindex;
263 }
264 }
265
266 return results;
267}
268
270 PDF::Quad quad;
271 PDF_CHECK_SUCCESS_X(PDPageTextGetCharQuad(m_pagetext.get(), m_charindex, &quad));
272 return quad;
273}
274
275inline std::wstring PageTextChar::GetUnicode() const {
276 return detail::GetWstringProperty(PDPageTextGetCharUnicode, m_pagetext.get(), m_charindex);
277}
278
279inline float PageTextChar::GetRotate() const {
280 float rotate = 0;
281 PDF_CHECK_SUCCESS_X(PDPageTextGetCharRotate(m_pagetext.get(), m_charindex, &rotate));
282 return rotate;
283}
284
286 PDFontInfo fontinfo = {};
287 PDF_CHECK_SUCCESS_X(PDPageTextGetFontInfo(m_pagetext.get(), m_charindex, &fontinfo));
288 return fontinfo;
289}
290
291inline float PageTextChar::GetFontSize() const {
292 float fontsize = 0;
293 PDF_CHECK_SUCCESS_X(PDPageTextGetFontSize(m_pagetext.get(), m_charindex, &fontsize));
294 return fontsize;
295}
296
297inline PDColorValue PageTextChar::GetFontColor() const {
298 PDColorValue fontcolor = 0;
299 PDF_CHECK_SUCCESS_X(PDPageTextGetFontColor(m_pagetext.get(), m_charindex, &fontcolor));
300 return fontcolor;
301}
302
307public:
308 using iterator_category = std::input_iterator_tag;
309 using value_type = PageTextChar;
310 using difference_type = int32_t;
311 using pointer = PageTextChar*;
312 using reference = PageTextChar&;
313
314public:
319
325 PageTextIterator(const PageText& pagetext, size_t charindex = 0);
326
332
338 PageTextIterator& Advance(int offset);
339
345
351
358
364
370
376
382
388 PageTextIterator& operator+=(difference_type offset);
389
395 PageTextIterator operator+(difference_type offset) const;
396
402 PageTextIterator& operator-=(difference_type offset);
403
409 PageTextIterator operator-(difference_type offset) const;
410
416 bool Equals(const PageTextIterator& rhs) const;
417
423 bool operator==(const PageTextIterator& rhs) const;
424
430 bool operator!=(const PageTextIterator& rhs) const;
431
437 bool LessThen(const PageTextIterator& rhs) const;
438
444 bool operator<(const PageTextIterator& rhs) const;
445
451 bool operator<=(const PageTextIterator& rhs) const;
452
458 bool operator>(const PageTextIterator& rhs) const;
459
465 bool operator>=(const PageTextIterator& rhs) const;
466
471 PageTextIterator begin() const;
472
477 PageTextIterator end() const;
478
483 PageText GetPageText() const;
484
489 size_t GetCharIndex() const;
490
495 PageTextChar GetChar() const;
496
497 PageTextChar operator*() const;
498
499private:
500 PageText m_pagetext;
501 size_t m_charindex;
502 size_t m_numchars;
503};
504
505using PageTextReverseIterator = std::reverse_iterator<PageTextIterator>;
506
508 : m_pagetext(nullptr), m_charindex(0), m_numchars(0) {
509}
510
511inline PageTextIterator::PageTextIterator(const PageText& pagetext, size_t charindex)
512 : m_pagetext(pagetext.get()), m_charindex(charindex) {
513 PDF_CHECK(pagetext, kPDErrBadParam, "Invalid PageText object");
514 m_numchars = pagetext.GetNumChars();
515 PDF_CHECK(charindex <= m_numchars, kPDErrOutOfRange, "Char index is out of range");
516}
517
519 : m_pagetext(rhs.m_pagetext), m_charindex(rhs.m_charindex), m_numchars(rhs.m_numchars) {
520}
521
523 if (offset != 0)
524 return *this;
525
526 PDF_CHECK(m_charindex + offset <= m_numchars, kPDErrBadParam, "Iterator is not incrementable");
527 m_charindex += offset;
528 return *this;
529}
530
532 return Advance(1);
533}
534
536 return Advance(-1);
537}
538
540 m_pagetext = rhs.m_pagetext;
541 m_charindex = rhs.m_charindex;
542 m_numchars = rhs.m_numchars;
543 return *this;
544}
545
547 PDF_CHECK(m_charindex < m_numchars, kPDErrBadParam, "Iterator is not incrementable");
548 ++m_charindex;
549 return *this;
550}
551
553 PDF_CHECK(m_charindex < m_numchars, kPDErrBadParam, "Iterator is not incrementable");
554 ++m_charindex;
555 return *this;
556}
557
559 PDF_CHECK(m_charindex > 0, kPDErrBadParam, "Iterator is not decrementable");
560 --m_charindex;
561 return *this;
562}
563
565 PDF_CHECK(m_charindex > 0, kPDErrBadParam, "Iterator is not decrementable");
566 --m_charindex;
567 return *this;
568}
569
570inline PageTextIterator& PageTextIterator::operator+=(difference_type offset) {
571 if (offset == 0)
572 return *this;
573
574 PDF_CHECK(m_charindex + offset <= m_numchars, kPDErrBadParam, "Iterator is not incrementable");
575
576 m_charindex += offset;
577 return *this;
578}
579
580inline PageTextIterator PageTextIterator::operator+(difference_type offset) const {
581 if (offset == 0)
582 return *this;
583
584 PageTextIterator tmp(*this);
585 tmp += offset;
586 return tmp;
587}
588
589inline PageTextIterator& PageTextIterator::operator-=(difference_type offset) {
590 if (offset == 0)
591 return *this;
592
593 PDF_CHECK(m_charindex - offset >= 0, kPDErrBadParam, "Iterator is not decrementable");
594 m_charindex -= offset;
595 return *this;
596}
597
598inline PageTextIterator PageTextIterator::operator-(difference_type offset) const {
599 if (offset == 0)
600 return *this;
601
602 PageTextIterator tmp(*this);
603 tmp -= offset;
604 return tmp;
605}
606
607inline bool PageTextIterator::Equals(const PageTextIterator& rhs) const {
608 return m_pagetext == rhs.m_pagetext && m_charindex == rhs.m_charindex;
609}
610
611inline bool PageTextIterator::operator==(const PageTextIterator& rhs) const {
612 return Equals(rhs);
613}
614
615inline bool PageTextIterator::operator!=(const PageTextIterator& rhs) const {
616 return !Equals(rhs);
617}
618
619inline bool PageTextIterator::LessThen(const PageTextIterator& rhs) const {
620 PDF_CHECK(m_pagetext == rhs.m_pagetext, kPDErrBadParam, "Iterators are not comparable - different pages");
621 return m_charindex < rhs.m_charindex;
622}
623
624inline bool PageTextIterator::operator<(const PageTextIterator& rhs) const {
625 return LessThen(rhs);
626}
627
628inline bool PageTextIterator::operator<=(const PageTextIterator& rhs) const {
629 return LessThen(rhs) || Equals(rhs);
630}
631
632inline bool PageTextIterator::operator>(const PageTextIterator& rhs) const {
633 return !LessThen(rhs) && !Equals(rhs);
634}
635
636inline bool PageTextIterator::operator>=(const PageTextIterator& rhs) const {
637 return !LessThen(rhs);
638}
639
641 return PageTextIterator(m_pagetext, m_charindex);
642}
643
645 return PageTextIterator(m_pagetext, m_numchars);
646}
647
649 return m_pagetext;
650}
651
652inline size_t PageTextIterator::GetCharIndex() const {
653 return m_charindex;
654}
655
657 return m_pagetext.GetChar(m_charindex);
658}
659
660inline PageTextChar PageTextIterator::operator*() const {
661 // const reference lifetime extension
662 return m_pagetext.GetChar(m_charindex);
663}
664
665} // namespace PDF
666
667#endif // PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
Represents a character in a PageText.
Definition page_text.h:83
PDFontInfo GetFontInfo() const
Get the font information of a character. This includes the font family, style and format.
Definition page_text.h:285
std::wstring GetUnicode() const
Get the Unicode representation of a character.
Definition page_text.h:275
PDColorValue GetFontColor() const
Get the font color of a character.
Definition page_text.h:297
float GetRotate() const
Get the rotation angle of a character within a text element.
Definition page_text.h:279
float GetFontSize() const
Get the font size of a character.
Definition page_text.h:291
PDF::Quad GetQuad() const
Get the quad (bounding box) of a character.
Definition page_text.h:269
Represents text on a page.
Definition page_text.h:42
std::vector< PageTextRange > Search(const std::wstring &text, PageTextSearchFlags flags=0) const
Search for the specified text throughout all text within a page.
Definition page_text.h:176
PageTextChar GetChar(size_t charindex) const
Get the character at the specified index.
Definition page_text.h:163
size_t HitTest(const PDF::PointF &point) const
Perform a hit test at the specified point and return the index of the character that contains the point.
Definition page_text.h:167
size_t GetNumChars() const
Get the number of characters from a page, text segment or a word.
Definition page_text.h:157
An iterator for iterating over the characters in a PageText object.
Definition page_text.h:306
bool operator>(const PageTextIterator &rhs) const
Greater than operator.
Definition page_text.h:632
PageText GetPageText() const
Get the PageText object.
Definition page_text.h:648
PageTextIterator operator-(difference_type offset) const
Subtraction operator.
Definition page_text.h:598
PageTextIterator & operator=(const PageTextIterator &rhs)
Assignment operator.
Definition page_text.h:539
bool operator<=(const PageTextIterator &rhs) const
Less than or equal to operator.
Definition page_text.h:628
bool operator!=(const PageTextIterator &rhs) const
Inequality operator.
Definition page_text.h:615
PageTextIterator & operator-=(difference_type offset)
Subtraction assignment operator.
Definition page_text.h:589
bool operator==(const PageTextIterator &rhs) const
Equality operator.
Definition page_text.h:611
PageTextIterator & Prev()
Move the iterator to the previous character.
Definition page_text.h:535
PageTextIterator & Next()
Move the iterator to the next character.
Definition page_text.h:531
PageTextIterator()
Default constructor.
Definition page_text.h:507
bool operator<(const PageTextIterator &rhs) const
Less than operator.
Definition page_text.h:624
PageTextIterator & operator--()
Pre-decrement operator.
Definition page_text.h:558
PageTextIterator operator+(difference_type offset) const
Addition operator.
Definition page_text.h:580
bool Equals(const PageTextIterator &rhs) const
Equality comparison.
Definition page_text.h:607
PageTextIterator & Advance(int offset)
Advance the iterator by the specified offset.
Definition page_text.h:522
PageTextIterator end() const
Get the ending iterator.
Definition page_text.h:644
size_t GetCharIndex() const
Get the character index.
Definition page_text.h:652
bool operator>=(const PageTextIterator &rhs) const
Greater than or equal to operator.
Definition page_text.h:636
PageTextChar GetChar() const
Get the character at the current iterator position.
Definition page_text.h:656
PageTextIterator & operator++()
Pre-increment operator.
Definition page_text.h:546
PageTextIterator begin() const
Get the beginning iterator.
Definition page_text.h:640
PageTextIterator & operator+=(difference_type offset)
Addition assignment operator.
Definition page_text.h:570
bool LessThen(const PageTextIterator &rhs) const
Less than comparison.
Definition page_text.h:619
@ kPDErrBadParam
Bad input parameter.
Definition errors.h:20
@ kPDErrOutOfRange
Value out of range.
Definition errors.h:22
Represents a range of text on a page.
Definition page_text.h:29
size_t endindex
Definition page_text.h:31
size_t begindex
Definition page_text.h:30
Definition math.h:132
Definition math.h:838
Definition fonts.h:25