>>4658
Itchy, got to finish what I started.
language: c++
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <podofo/podofo.h>
#include <string_view>
#include <map>
#include <unordered_map>
/// Write the command-line usage summary to standard output.
void print_help()
{
    // The text contains no conversion specifiers, so fputs emits it verbatim.
    fputs("Usage: podofotxtextract [inputfile] [outputfile]\n\n", stdout);
}
/**
 * Follow indirect references until a non-reference object is reached.
 *
 * @param document document whose object table is used to look references up
 * @param pdfobj   object to resolve; may be nullptr
 * @return the first non-reference object reached, or nullptr when pdfobj is
 *         nullptr, when a lookup in the object table yields nullptr, or when
 *         the chain of references is longer than the hop limit below
 */
PoDoFo::PdfObject *deref_object (PoDoFo::PdfMemDocument &document, PoDoFo::PdfObject *pdfobj) {
    // Bound the walk so that a cyclic chain of references in a malformed
    // file terminates instead of looping forever.
    const int max_hops = 64;
    for (int hop = 0; pdfobj && pdfobj->IsReference(); ++hop) {
        if (hop == max_hops) {
            return nullptr;
        }
        pdfobj = document.GetObjects().GetObject(pdfobj->GetReference());
    }
    return pdfobj;
}
/**
 * Resolve pdfobj through any indirect references and return its integer value.
 *
 * @param document document used to resolve references
 * @param pdfobj   number object or a reference to one; may be nullptr
 * @return the value reported by GetNumber() on the resolved object
 * @throws PoDoFo::PdfError (ePdfError_NoObject) when pdfobj is nullptr or
 *         resolves to nothing, instead of dereferencing a null pointer
 */
int64_t get_number(PoDoFo::PdfMemDocument &document, PoDoFo::PdfObject *pdfobj) {
    // Check before and after resolving: the key may be absent (nullptr in)
    // or the reference may not resolve (nullptr out).
    PoDoFo::PdfObject *resolved = pdfobj ? deref_object(document, pdfobj) : nullptr;
    if (!resolved) {
        PODOFO_RAISE_ERROR_INFO(PoDoFo::ePdfError_NoObject,
                                "Expected a number object but none was found.");
    }
    return resolved->GetNumber();
}
/**
 * Look up a key in the dictionary held by obj.
 *
 * @param obj object holding the dictionary; may be nullptr
 * @param key key name, without the leading slash
 * @return whatever PdfDictionary::GetKey returns for the key, or nullptr
 *         when obj itself is nullptr
 */
inline PoDoFo::PdfObject *obj_dir_get (PoDoFo::PdfObject *obj,
                                       const std::string &key) {
    // Callers pass pointers that can legitimately be null (for example a
    // page without resources), so treat that as "key not present".
    if (!obj) {
        return nullptr;
    }
    return obj->GetDictionary().GetKey(PoDoFo::PdfName(key));
}
/**
 * Summary of one image object, used as the key when looking for duplicates.
 *
 * Only the dimensions, the /Length value and a hash of the stream data are
 * kept; the stream data itself is released again in the constructor.
 */
class image_obj {
public:
    int64_t height;                 // value of the /Height key
    int64_t width;                  // value of the /Width key
    int64_t length;                 // value of the /Length key
    std::size_t data_hash;          // std::hash over the buffer returned by GetFilteredCopy
    PoDoFo::PdfReference ref;       // reference of the object this summary was built from

    /**
     * Build the summary for pdfobj.
     *
     * @param document document used to resolve indirect /Height, /Width and
     *                 /Length values
     * @param pdfobj   image object to summarise
     */
    image_obj(PoDoFo::PdfMemDocument &document, PoDoFo::PdfObject &pdfobj) {
        height = get_number(document, obj_dir_get(&pdfobj, "Height"));
        width = get_number(document, obj_dir_get(&pdfobj, "Width"));
        length = get_number(document, obj_dir_get(&pdfobj, "Length"));

        char *pBuffer = nullptr;
        PoDoFo::pdf_long lLen = 0;
        pdfobj.GetStream()->GetFilteredCopy( &pBuffer, &lLen );
        data_hash = std::hash<std::string_view>()(std::string_view(pBuffer, lLen));
        // GetFilteredCopy hands ownership of the buffer to the caller; only
        // the hash is needed from here on, so release it right away instead
        // of leaking one stream-sized allocation per image.
        PoDoFo::podofo_free(pBuffer);

        ref = pdfobj.Reference();
    }
};
/// Two image summaries are equal when height, width, length and data hash
/// all match. The object reference is deliberately not part of the
/// comparison, and the stream bytes themselves are not compared.
bool operator==(const image_obj& img1, const image_obj& img2) {
    if (img1.height != img2.height) {
        return false;
    }
    if (img1.width != img2.width) {
        return false;
    }
    if (img1.length != img2.length) {
        return false;
    }
    return img1.data_hash == img2.data_hash;
}
namespace std {
/// Hash for image_obj keys: reuses the hash already computed over the
/// image's stream data.
template<> struct hash<image_obj> {
    size_t operator()(const image_obj& image) const noexcept {
        return image.data_hash;
    }
};
}
namespace std {
/// Hash for PdfReference keys: the object number alone is used as the
/// hash value.
template<> struct hash<PoDoFo::PdfReference> {
    size_t operator()(const PoDoFo::PdfReference& reference) const noexcept {
        const size_t object_number = reference.ObjectNumber();
        return object_number;
    }
};
}
/**
 * Test whether obj is a dictionary whose /Type is the name "XObject" and
 * whose /Subtype is the name given in subtype.
 *
 * @param obj     object to inspect
 * @param subtype expected /Subtype name, without the leading slash
 * @return 1 when both keys are present, are names and match; 0 otherwise
 */
int is_xobj_of (PoDoFo::PdfObject *obj, const std::string &subtype) {
    if (!obj->IsDictionary()) {
        return 0;
    }

    PoDoFo::PdfDictionary &dict = obj->GetDictionary();
    PoDoFo::PdfObject *type_entry = dict.GetKey( PoDoFo::PdfName::KeyType );
    PoDoFo::PdfObject *subtype_entry = dict.GetKey( PoDoFo::PdfName::KeySubtype );

    const bool type_ok = type_entry && type_entry->IsName() &&
                         type_entry->GetName().GetName() == "XObject";
    if (!type_ok) {
        return 0;
    }

    const bool subtype_ok = subtype_entry && subtype_entry->IsName() &&
                            subtype_entry->GetName().GetName() == subtype;
    return subtype_ok ? 1 : 0;
}
/// Return 1 when obj is an XObject dictionary with /Subtype "Image", else 0.
int is_image (PoDoFo::PdfObject *obj) {
    const std::string wanted_subtype("Image");
    return is_xobj_of(obj, wanted_subtype);
}
/// Return 1 when obj is an XObject dictionary with /Subtype "Form", else 0.
int is_form (PoDoFo::PdfObject *obj) {
    const std::string wanted_subtype("Form");
    return is_xobj_of(obj, wanted_subtype);
}
/**
 * Scan every object of the document for image XObjects and record which of
 * them duplicate an earlier one.
 *
 * @param document document to scan
 * @param imgrefs  output map; for every image that compares equal to an
 *                 image seen earlier in the scan, an entry
 *                 (duplicate reference -> first-seen reference) is inserted
 */
void dedup_imgs (PoDoFo::PdfMemDocument &document,
                 std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> &imgrefs) {
    // First-seen image summary -> reference of the object it came from.
    std::unordered_map<image_obj, PoDoFo::PdfReference> images;

    auto &objects = document.GetObjects();
    for (PoDoFo::TCIVecObjects it = objects.begin(); it != objects.end(); ++it) {
        if (!is_image(*it)) {
            continue;
        }

        const image_obj image(document, **it);

        // A single insert both registers a new image and, when an equal
        // one is already stored, returns the entry it collided with.
        const auto result = images.insert({image, image.ref});
        if (!result.second) {
            imgrefs.insert({image.ref, result.first->second});
        }

        // The summary has been taken; hand the object back to the document
        // (presumably to limit peak memory — confirm against PoDoFo docs).
        document.FreeObjectMemory(*it);
    }
}
/**
 * If a dictionary entry is a reference to a duplicate image, re-point it at
 * the image that is being kept.
 *
 * @param xobj_dict object holding the dictionary that contains the entry
 * @param to_check  (key, value) pair of the entry to examine
 * @param imgrefs   map of duplicate reference -> kept reference
 *
 * Nothing happens when the value is nullptr, is not a reference, or is a
 * reference that does not appear in imgrefs.
 */
void check_replace_ref (PoDoFo::PdfObject *xobj_dict,
                        const std::pair<const PoDoFo::PdfName,PoDoFo::PdfObject*> &to_check,
                        std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> &imgrefs) {
    PoDoFo::PdfObject *obj = to_check.second;
    if (!obj || !obj->IsReference()) {
        return;
    }

    const auto found = imgrefs.find(obj->GetReference());
    if (found == imgrefs.end()) {
        return;
    }

    // to_check may refer to the entry stored inside the very dictionary
    // that AddKey is about to modify. Work from private copies so the call
    // never depends on storage it may replace.
    const PoDoFo::PdfName key(to_check.first);
    const PoDoFo::PdfReference fromref = found->second;
    xobj_dict->GetDictionary().AddKey(key, PoDoFo::PdfObject(fromref));
}
/**
 * Re-point references to duplicate images that are held by form XObjects
 * (through /Resources -> /XObject) and by image XObjects (through /Mask and
 * /SMask).
 *
 * @param document document whose objects are scanned
 * @param imgrefs  map of duplicate reference -> kept reference
 */
void dedup_xobjs (PoDoFo::PdfMemDocument &document,
                  std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> &imgrefs) {
    PoDoFo::TCIVecObjects it = document.GetObjects().begin();
    while (it != document.GetObjects().end()) {
        if (is_form(*it)) {
            // Both /Resources and its /XObject entry may be stored as
            // indirect references, so resolve each before using it as a
            // dictionary, and skip anything that does not resolve to one.
            PoDoFo::PdfObject *resources = obj_dir_get(*it, "Resources");
            if (resources) {
                resources = deref_object(document, resources);
            }
            if (resources && resources->IsDictionary()) {
                PoDoFo::PdfObject *xobj = obj_dir_get(resources, "XObject");
                if (xobj) {
                    xobj = deref_object(document, xobj);
                }
                if (xobj && xobj->IsDictionary()) {
                    auto xobj_it = xobj->GetDictionary().begin();
                    while (xobj_it != xobj->GetDictionary().end()) {
                        // check_replace_ref may call AddKey on this same
                        // dictionary. Copy the entry and step past it first
                        // so neither the iterator nor the argument refers
                        // to an entry that is being replaced.
                        const auto entry = *xobj_it;
                        ++xobj_it;
                        check_replace_ref(xobj, entry, imgrefs);
                    }
                }
            }
            document.FreeObjectMemory(*it);
        }
        else if (is_image(*it)) {
            // Only reference-valued masks can point at another image.
            PoDoFo::PdfObject *mask = obj_dir_get(*it, "Mask");
            if (mask && mask->IsReference()) {
                check_replace_ref(*it, {PoDoFo::PdfName("Mask"), mask}, imgrefs);
            }
            PoDoFo::PdfObject *smask = obj_dir_get(*it, "SMask");
            if (smask && smask->IsReference()) {
                check_replace_ref(*it, {PoDoFo::PdfName("SMask"), smask}, imgrefs);
            }
            document.FreeObjectMemory(*it);
        }
        ++it;
    }
}
/**
 * Re-point references to duplicate images that are held by the /XObject
 * resource dictionary of each page.
 *
 * @param document document whose pages are visited
 * @param imgrefs  map of duplicate reference -> kept reference
 *
 * Pages without a usable resource or /XObject dictionary are skipped.
 */
void dedup_pages (PoDoFo::PdfMemDocument &document,
                  std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> &imgrefs) {
    const int nCount = document.GetPageCount();
    for (int i = 0; i < nCount; i++) {
        PoDoFo::PdfPage *pPage = document.GetPage(i);
        if (!pPage) {
            continue;
        }

        PoDoFo::PdfObject *resources = pPage->GetResources();
        if (resources) {
            resources = deref_object(document, resources);
        }
        if (!resources || !resources->IsDictionary()) {
            continue;
        }

        // The /XObject entry may itself be an indirect reference.
        PoDoFo::PdfObject *xobj = obj_dir_get(resources, "XObject");
        if (xobj) {
            xobj = deref_object(document, xobj);
        }
        if (!xobj || !xobj->IsDictionary()) {
            continue;
        }

        auto xobj_it = xobj->GetDictionary().begin();
        while (xobj_it != xobj->GetDictionary().end()) {
            // check_replace_ref may call AddKey on this same dictionary.
            // Copy the entry and step past it first so neither the iterator
            // nor the argument refers to an entry that is being replaced.
            const auto entry = *xobj_it;
            ++xobj_it;
            check_replace_ref(xobj, entry, imgrefs);
        }
    }
}
/**
 * Rewrite references to duplicate images throughout the document.
 *
 * @param document document to update
 * @param imgrefs  map of duplicate reference -> kept reference
 */
void dedup_imgrefs (PoDoFo::PdfMemDocument &document,
                    std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> &imgrefs) {
    // Pass 1: the /XObject resource dictionaries of the pages.
    dedup_pages(document, imgrefs);

    // Pass 2: form XObject resources and image masks.
    dedup_xobjs(document, imgrefs);
}
/**
 * Remove every duplicate image object from the document.
 *
 * @param document document to remove the objects from
 * @param imgrefs  map whose keys are the references of the duplicates
 */
void delete_img_obj (PoDoFo::PdfMemDocument &document,
                     std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> &imgrefs) {
    for (const auto &mapping : imgrefs) {
        // RemoveObject detaches the object from the document and returns
        // it; ownership passes to the caller, so destroy it here rather
        // than leaking it. Deleting nullptr (object not found) is a no-op.
        delete document.GetObjects().RemoveObject(mapping.first);
    }
}
/**
 * Entry point: read the PDF named by argv[1], collapse duplicate image
 * objects, and write the result to the file named by argv[2].
 *
 * @return 0 on success; the PoDoFo error code when a PdfError is caught.
 *         The process exits with -1 on a usage error.
 */
int main( int argc, char* argv[] )
{
    if( argc != 3 )
    {
        print_help();
        exit( -1 );
    }

    const char* pszInput = argv[1];
    const char* pszOutput = argv[2];

    // Objects may still be read lazily from the input file while the output
    // is being written, so overwriting the input in place is refused.
    // NOTE(review): this only catches textually identical paths.
    if( std::string_view(pszInput) == std::string_view(pszOutput) )
    {
        fprintf( stderr, "Error: input and output file must be different.\n" );
        exit( -1 );
    }

    try {
        // duplicate image reference -> reference of the image that is kept
        std::unordered_map<PoDoFo::PdfReference, PoDoFo::PdfReference> imgrefs;
        PoDoFo::PdfMemDocument document(pszInput);

        dedup_imgs(document, imgrefs);      // find the duplicates
        dedup_imgrefs(document, imgrefs);   // re-point users at the kept images
        delete_img_obj(document, imgrefs);  // drop the now unused objects

        PoDoFo::PdfObject trailer(*(document.GetTrailer()));
        PoDoFo::PdfWriter writer( &(document.GetObjects()), &trailer);
        // Carry the input's PDF version over instead of relying on the
        // writer's default, which need not match the document's content.
        writer.SetPdfVersion(document.GetPdfVersion());
        writer.SetWriteMode(PoDoFo::ePdfWriteMode_Clean);
        writer.Write(pszOutput);
    } catch( PoDoFo::PdfError & e ) {
        fprintf( stderr, "Error: An error %i ocurred during processing the pdf file.\n", e.GetError() );
        e.PrintErrorMsg();
        return e.GetError();
    }
    return 0;
}