Skip to content

feature: Delayed background rendering #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
May 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pdf2htmlEX/src/BackgroundRenderer/CairoBackgroundRenderer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,9 @@ bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
if (doc->getPageRotate(pageno) == 90 || doc->getPageRotate(pageno) == 270)
std::swap(page_height, page_width);

string fn = (char*)html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno);
if(param.embed_image)
html_renderer->tmp_files.add(fn);
auto fn = html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno);

surface = cairo_svg_surface_create(fn.c_str(), page_width * param.actual_dpi / DEFAULT_DPI, page_height * param.actual_dpi / DEFAULT_DPI);
surface = cairo_svg_surface_create((const char *)fn, page_width * param.actual_dpi / DEFAULT_DPI, page_height * param.actual_dpi / DEFAULT_DPI);
cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2);
cairo_surface_set_fallback_resolution(surface, param.actual_dpi, param.actual_dpi);

Expand Down Expand Up @@ -174,15 +172,14 @@ bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
{
int n = 0;
char c;
ifstream svgfile(fn);
ifstream svgfile((const char *)fn);
//count of '<' in the file should be an approximation of node count.
while(svgfile >> c)
{
if (c == '<')
++n;
if (n > param.svg_node_count_limit)
{
html_renderer->tmp_files.add(fn);
return false;
}
}
Expand All @@ -192,6 +189,9 @@ bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
for (auto id : bitmaps_in_current_page)
++bitmaps_ref_count[id];

if(param.embed_image)
html_renderer->tmp_files.add((const char *)fn);

return true;
}

Expand Down
95 changes: 42 additions & 53 deletions pdf2htmlEX/src/BackgroundRenderer/SplashBackgroundRenderer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,66 +111,55 @@ bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
(!(param.use_cropbox)),
false, false,
nullptr, nullptr, &annot_cb, &process_annotation);

auto * bitmap = getBitmap();

auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str());

SplashImageFileFormat splashImageFileFormat;
if(format == "png")
splashImageFileFormat = splashFormatPng;
else if(format == "jpg")
splashImageFileFormat = splashFormatJpeg;
else
throw string("Image format not supported: ") + format;

SplashError e = bitmap->writeImgFile(splashImageFileFormat, (const char *)fn, param.actual_dpi, param.actual_dpi);
if (e != splashOk)
throw string("Cannot write background image. SplashErrorCode: ") + std::to_string(e);

if(param.embed_image)
html_renderer->tmp_files.add((const char *)fn);

return true;
}

void SplashBackgroundRenderer::embed_image(int pageno)
{
auto * bitmap = getBitmap();
// dump the background image only when it is not empty
if(bitmap->getWidth() >= 0 && bitmap->getHeight() >= 0)
auto & f_page = *(html_renderer->f_curpage);

f_page << "<img class=\"" << CSS::FULL_BACKGROUND_IMAGE_CN
<< "\" alt=\"\" src=\"";

if(param.embed_image)
{
auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str());
ifstream fin((char*)path, ifstream::binary);
if(!fin)
throw string("Cannot read background image ") + (char*)path;

auto iter = FORMAT_MIME_TYPE_MAP.find(format);
if(iter == FORMAT_MIME_TYPE_MAP.end())
throw string("Image format not supported: ") + format;

string mime_type = iter->second;
f_page << "data:" << mime_type << ";base64," << Base64Stream(fin);
}
else
{
{
auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str());
if(param.embed_image)
html_renderer->tmp_files.add((const char *)fn);

SplashImageFileFormat splashImageFileFormat;
if(format == "png")
splashImageFileFormat = splashFormatPng;
else if(format == "jpg")
splashImageFileFormat = splashFormatJpeg;
else
throw string("Image format not supported: ") + format;

SplashError e = bitmap->writeImgFile(splashImageFileFormat, (const char *)fn, param.actual_dpi, param.actual_dpi);
if (e != splashOk)
throw string("Cannot write background image. SplashErrorCode: ") + std::to_string(e);
}

double h_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.actual_dpi;
double v_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.actual_dpi;

auto & f_page = *(html_renderer->f_curpage);
auto & all_manager = html_renderer->all_manager;

f_page << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN
<< " " << CSS::LEFT_CN << all_manager.left.install(0.0L)
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(0.0L)
<< " " << CSS::WIDTH_CN << all_manager.width.install(h_scale * bitmap->getWidth())
<< " " << CSS::HEIGHT_CN << all_manager.height.install(v_scale * bitmap->getHeight())
<< "\" alt=\"\" src=\"";

if(param.embed_image)
{
auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str());
ifstream fin((char*)path, ifstream::binary);
if(!fin)
throw string("Cannot read background image ") + (char*)path;

auto iter = FORMAT_MIME_TYPE_MAP.find(format);
if(iter == FORMAT_MIME_TYPE_MAP.end())
throw string("Image format not supported: ") + format;

string mime_type = iter->second;
f_page << "data:" << mime_type << ";base64," << Base64Stream(fin);
}
else
{
f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str());
}
f_page << "\"/>";
f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str());
}
f_page << "\"/>";
}

} // namespace pdf2htmlEX
18 changes: 11 additions & 7 deletions pdf2htmlEX/src/CoveredTextDetector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@

namespace pdf2htmlEX {

CoveredTextDetector::CoveredTextDetector(Param & param): param(param)
// Default-construct a detector that is not bound to any Param.
// This overload exists so that a CoveredTextDetector can be default-constructed
// (e.g. as a member of a struct stored in a container) and assigned later.
// The param pointer is explicitly set to nullptr so it is never left
// indeterminate; methods that dereference param must not be called until a
// detector constructed with a Param has been assigned to this object.
CoveredTextDetector::CoveredTextDetector(): param(nullptr)
{
}

CoveredTextDetector::CoveredTextDetector(Param & param): param(&param)
{
}

Expand All @@ -41,10 +45,10 @@ void CoveredTextDetector::add_char_bbox_clipped(cairo_t *cairo, double * bbox, i
char_pts_visible.push_back(pts_visible);

// DCRH: Hide if no points are visible, or if some points are visible and correct_text_visibility == 2
if (pts_visible == 0 || param.correct_text_visibility == 2) {
if (pts_visible == 0 || param->correct_text_visibility == 2) {
chars_covered.push_back(true);
if (pts_visible > 0 && param.correct_text_visibility == 2) {
param.actual_dpi = std::min(param.text_dpi, param.max_dpi); // Char partially covered so increase background resolution
if (pts_visible > 0 && param->correct_text_visibility == 2) {
param->actual_dpi = std::min(param->text_dpi, param->max_dpi); // Char partially covered so increase background resolution
}
} else {
chars_covered.push_back(false);
Expand Down Expand Up @@ -98,13 +102,13 @@ printf("pts_visible=%x\n", pts_visible);
printf("pts_visible=%x\n", pts_visible);
#endif
char_pts_visible[i] = pts_visible;
if (pts_visible == 0 || (pts_visible != (1|2|4|8) && param.correct_text_visibility == 2)) {
if (pts_visible == 0 || (pts_visible != (1|2|4|8) && param->correct_text_visibility == 2)) {
#ifdef DEBUG
printf("Char covered\n");
#endif
chars_covered[i] = true;
if (pts_visible > 0 && param.correct_text_visibility == 2) { // Partially visible text => increase rendering DPI
param.actual_dpi = std::min(param.text_dpi, param.max_dpi);
if (pts_visible > 0 && param->correct_text_visibility == 2) { // Partially visible text => increase rendering DPI
param->actual_dpi = std::min(param->text_dpi, param->max_dpi);
}
}
} else {
Expand Down
3 changes: 2 additions & 1 deletion pdf2htmlEX/src/CoveredTextDetector.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ namespace pdf2htmlEX {
class CoveredTextDetector
{
public:
CoveredTextDetector();

CoveredTextDetector(Param & param);

Expand Down Expand Up @@ -60,7 +61,7 @@ class CoveredTextDetector
// x00, y00, x01, y01; x10, y10, x11, y11;...
std::vector<double> char_bboxes;
std::vector<int> char_pts_visible;
Param & param;
Param * param;
};

}
Expand Down
7 changes: 7 additions & 0 deletions pdf2htmlEX/src/HTMLRenderer/HTMLRenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ struct HTMLRenderer : OutputDev

void process(PDFDoc * doc);

bool renderPage(PDFDoc * doc, int pageno);

////////////////////////////////////////////////////
// OutputDev interface
////////////////////////////////////////////////////
Expand Down Expand Up @@ -379,6 +381,11 @@ struct HTMLRenderer : OutputDev

CoveredTextDetector covered_text_detector;
DrawingTracer tracer;

// Per-page state saved when a page finishes processing so that its
// background can be rendered later (see renderPage) instead of in endPage.
struct PageCache {
// Copy of the covered-text detector as it was at the end of the page.
CoveredTextDetector covered_text_detector;
};
// Saved state keyed by page number; filled in endPage when
// param.delay_background is set and read back by renderPage.
std::unordered_map<int, PageCache> page_cache;
};

} //namespace pdf2htmlEX
Expand Down
57 changes: 51 additions & 6 deletions pdf2htmlEX/src/HTMLRenderer/general.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,44 @@ void HTMLRenderer::process(PDFDoc *doc)

post_process();

bg_renderer = nullptr;
fallback_bg_renderer = nullptr;
if (param.delay_background == 0)
{
bg_renderer = nullptr;
fallback_bg_renderer = nullptr;
}

if(param.quiet == 0)
cerr << endl;
}

// Render the background of a previously processed page on demand.
//
// Only meaningful when param.delay_background is non-zero: in that mode
// endPage stores the page's covered-text detector in page_cache, and this
// function restores it before asking the background renderer(s) to render.
//
// doc    - document the page belongs to.
//          NOTE(review): currently unused; rendering goes through cur_doc.
//          Confirm callers always pass the same document process() was given.
// pageno - page number used as the key into page_cache.
//
// Returns true if the primary or the fallback background renderer rendered
// the page; false if delayed rendering is disabled, the page is not cached,
// no background renderer is available, or rendering failed.
bool HTMLRenderer::renderPage(PDFDoc *doc, int pageno)
{
    if (param.delay_background == 0)
        return false;

    // Single lookup: the iterator is reused below instead of operator[].
    const auto cached = page_cache.find(pageno);
    if (cached == page_cache.end())
    {
        cerr << "Page number " << pageno << " not found in page cache" << endl;
        return false;
    }

    // Without a primary background renderer there is nothing to render with;
    // bail out instead of dereferencing a null pointer.
    if (!bg_renderer)
        return false;

    // The renderers consult the detector of the page being rendered, so put
    // back the state captured when that page was processed.
    covered_text_detector = cached->second.covered_text_detector;

    if (bg_renderer->render_page(cur_doc, pageno))
        return true;

    // Primary renderer declined the page; try the fallback if one exists.
    if (fallback_bg_renderer && fallback_bg_renderer->render_page(cur_doc, pageno))
        return true;

    return false;
}

void HTMLRenderer::setDefaultCTM(const double *ctm)
{
memcpy(default_ctm, ctm, sizeof(default_ctm));
Expand Down Expand Up @@ -243,14 +274,21 @@ void HTMLRenderer::endPage() {

if(param.process_nontext)
{
if (bg_renderer->render_page(cur_doc, pageNum))
if (param.delay_background)
{
bg_renderer->embed_image(pageNum);
}
else if (fallback_bg_renderer)
else
{
if (fallback_bg_renderer->render_page(cur_doc, pageNum))
fallback_bg_renderer->embed_image(pageNum);
if (bg_renderer->render_page(cur_doc, pageNum))
{
bg_renderer->embed_image(pageNum);
}
else if (fallback_bg_renderer)
{
if (fallback_bg_renderer->render_page(cur_doc, pageNum))
fallback_bg_renderer->embed_image(pageNum);
}
}
}

Expand Down Expand Up @@ -294,6 +332,13 @@ void HTMLRenderer::endPage() {
{
f_pages.fs << "</div>" << endl;
}

if (param.delay_background)
{
page_cache[this->pageNum] = {
.covered_text_detector = covered_text_detector,
};
}
}

void HTMLRenderer::pre_process(PDFDoc * doc)
Expand Down
1 change: 1 addition & 0 deletions pdf2htmlEX/src/Param.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct Param
int printing;
int fallback;
int tmp_file_size_limit;
int delay_background;

// fonts
int embed_external_font;
Expand Down
Loading