Skip to content

Commit fe04fd3

Browse files
author
yu
committed
Do not add a space character between CJK words.
1 parent 2a40945 commit fe04fd3

File tree

3 files changed

+32
-3
lines changed

3 files changed

+32
-3
lines changed

qt/src/Utils.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,19 @@ QString Utils::getSpellingLanguage(const QString& lang) {
221221
}
222222
return syslang;
223223
}
224+
225+
// Unicode blocks http://www.fileformat.info/info/unicode/block/index.htm
226+
bool Utils::spacedWord(const QString& text, bool prevWord) {
227+
short unicode = (prevWord ? text.back() : text.front()).unicode();
228+
// CJK Word
229+
std::vector<std::pair<int, int>> cjkWordRange{{0x2480, 0x303f}, {0x31c0, 0x9fff}
230+
, {0xf900, 0xfaff}, {0xfe30, 0xfe4f}, {0x20000, 0x2fa1f}};
231+
for(int i = 0; i < cjkWordRange.size(); i++) {
232+
if(unicode < cjkWordRange[i].first) {
233+
return true;
234+
} else if(unicode >= cjkWordRange[i].first && unicode <= cjkWordRange[i].second) {
235+
return false;
236+
}
237+
}
238+
return true;
239+
}

qt/src/Utils.hh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ QByteArray download(QUrl url, QString& messages, int timeout = 60000);
4747

4848
QString getSpellingLanguage(const QString& lang = QString());
4949

50+
// Reports whether a word should be separated from its neighbour by a space.
// Words whose boundary character is CJK are meant to be reported as unspaced (false).
// text:     the word to inspect.
// prevWord: true to inspect the last character of text (the word precedes the gap),
//           false to inspect its first character (the word follows the gap).
bool spacedWord(const QString& text, bool prevWord);
51+
5052
template<typename T>
5153
class AsyncQueue {
5254
public:

qt/src/hocr/HOCRPdfExporter.cc

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,8 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
688688
QString itemClass = item->itemClass();
689689
QRect itemRect = item->bbox();
690690
int childCount = item->children().size();
691+
bool prevSpacedWord, currentSpacedWord;
692+
prevSpacedWord = currentSpacedWord = false;
691693
if(itemClass == "ocr_par" && pdfSettings.uniformizeLineSpacing) {
692694
double yInc = double(itemRect.height()) / childCount;
693695
double y = itemRect.top() + yInc;
@@ -706,15 +708,24 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
706708
if(pdfSettings.fontSize == -1) {
707709
painter.setFontSize(wordItem->fontSize() * pdfSettings.detectedFontScaling);
708710
}
711+
712+
prevWordRight = wordRect.right();
713+
QString text = wordItem->text();
714+
currentSpacedWord = Utils::spacedWord(text, false);
709715
// If distance from previous word is large, keep the space
710716
if(wordRect.x() - prevWordRight > pdfSettings.preserveSpaceWidth * painter.getAverageCharWidth() / px2pu) {
711717
x = wordRect.x();
718+
} else {
719+
//need space
720+
if(currentSpacedWord && prevSpacedWord ) {
721+
x += painter.getTextWidth(" ") / px2pu;
722+
}
712723
}
713-
prevWordRight = wordRect.right();
714-
QString text = wordItem->text();
724+
715725
double wordBaseline = (x - itemRect.x()) * baseline.first + baseline.second;
716726
painter.drawText(x * px2pu, (y + wordBaseline) * px2pu, text);
717-
x += painter.getTextWidth(text + " ") / px2pu;
727+
x += painter.getTextWidth(text) / px2pu;
728+
prevSpacedWord = Utils::spacedWord(text, true);
718729
}
719730
}
720731
} else if(itemClass == "ocr_line" && !pdfSettings.uniformizeLineSpacing) {

0 commit comments

Comments
 (0)