Skip to content

Commit 54ba91f

Browse files
authored
Merge pull request #339 from openzim/metadata_table
2 parents 3dd2f59 + d32037c commit 54ba91f

File tree

15 files changed

+694
-77
lines changed

15 files changed

+694
-77
lines changed

src/metadata.cpp

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
/*
2+
* Copyright 2023 Veloman Yunkan <veloman.yunkan@gmail.com>
3+
*
4+
* This program is free software; you can redistribute it and/or modify
5+
* it under the terms of the GNU General Public License as published by
6+
* the Free Software Foundation; either version 3 of the License, or
7+
* any later version.
8+
*
9+
* This program is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU General Public License
15+
* along with this program; if not, write to the Free Software
16+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17+
* MA 02110-1301, USA.
18+
*/
19+
20+
#include "metadata.h"
21+
22+
#include <sstream>
23+
#include <regex>
24+
#include <unicode/unistr.h>
25+
26+
#include <cctype>
27+
#include <iomanip>
28+
29+
30+
namespace zim
31+
{
32+
33+
namespace
34+
{
35+
36+
// Readable values for the isMandatory column of the reserved metadata table.
constexpr bool MANDATORY = true;
constexpr bool OPTIONAL  = false;

// Patterns used by the reserved metadata table. They are compiled with
// std::regex and must match the whole metadata value (see matchRegex()).
const std::string LANGS_REGEXP = "\\w{3}(,\\w{3})*";
const std::string DATE_REGEXP = R"(\d\d\d\d-\d\d-\d\d)";

// The PNG pattern contains an embedded NUL character, so it cannot be built
// from a plain C string (construction would stop at the NUL). Instead the
// std::string is constructed from the array with an explicit length that
// drops only the terminating NUL of the literal.
const char PNG_REGEXP_DATA[] = "^\x89\x50\x4e\x47\x0d\x0a\x1a\x0a(.|\\s|\0)+";
const std::string PNG_REGEXP(PNG_REGEXP_DATA, sizeof(PNG_REGEXP_DATA)-1);
46+
47+
// Tells whether the whole of `text` (not just a part of it) matches the
// regular expression `regexStr`, compiled with std::regex's default flags.
// The expression is compiled on every call.
bool matchRegex(const std::string& regexStr, const std::string& text)
{
  const std::regex pattern(regexStr);
  return std::regex_match(text, pattern);
}
52+
53+
size_t getTextLength(const std::string& utf8EncodedString)
54+
{
55+
return icu::UnicodeString::fromUTF8(utf8EncodedString).length();
56+
}
57+
58+
class MetadataComplexCheckBase
59+
{
60+
public:
61+
const std::string description;
62+
const MetadataComplexCheckBase* const prev;
63+
64+
public: // functions
65+
explicit MetadataComplexCheckBase(const std::string& desc);
66+
67+
MetadataComplexCheckBase(const MetadataComplexCheckBase&) = delete;
68+
MetadataComplexCheckBase(MetadataComplexCheckBase&&) = delete;
69+
void operator=(const MetadataComplexCheckBase&) = delete;
70+
void operator=(MetadataComplexCheckBase&&) = delete;
71+
72+
virtual ~MetadataComplexCheckBase();
73+
74+
virtual bool checkMetadata(const Metadata& m) const = 0;
75+
76+
static const MetadataComplexCheckBase* getLastCheck() { return last; }
77+
78+
private: // functions
79+
static const MetadataComplexCheckBase* last;
80+
};
81+
82+
const MetadataComplexCheckBase* MetadataComplexCheckBase::last = nullptr;
83+
84+
MetadataComplexCheckBase::MetadataComplexCheckBase(const std::string& desc)
85+
: description(desc)
86+
, prev(last)
87+
{
88+
last = this;
89+
}
90+
91+
// The destructor deliberately does not unlink the object from the list of
// checks, although strictly speaking it should. This is acceptable because
// MetadataComplexCheckBase objects are only ever created in static storage,
// and the list is not supposed to be accessed once any of them has been
// destroyed during program termination clean-up.
MetadataComplexCheckBase::~MetadataComplexCheckBase() = default;
99+
100+
// Pastes two tokens together. Its arguments are pasted as written, without
// being macro-expanded first.
#define CONCAT(X, Y) X##Y

// Builds a class name from a unique suffix. Going through CONCAT from here
// lets an argument such as __LINE__ be expanded to its value before pasting.
#define GENCLSNAME(UUID) CONCAT(MetadataComplexCheck, UUID)

// Defines a class CLSNAME derived from MetadataComplexCheckBase, creates one
// constant object of it (whose construction adds it to the list of checks)
// and opens the definition of its checkMetadata() member function. The macro
// invocation must be followed by the function body, inside which the metadata
// being checked is available as `data`.
#define ADD_METADATA_COMPLEX_CHECK(DESC, CLSNAME)                 \
  class CLSNAME : public MetadataComplexCheckBase                 \
  {                                                               \
  public:                                                         \
    CLSNAME() : MetadataComplexCheckBase(DESC) {}                 \
    bool checkMetadata(const Metadata& data) const override;      \
  };                                                              \
                                                                  \
  const CLSNAME CONCAT(obj, CLSNAME);                             \
                                                                  \
  bool CLSNAME::checkMetadata(const Metadata& data) const         \
  /* should be followed by the check body */

// Declares a complex check described by DESC. The generated class name is
// derived from the current line number, so at most one METADATA_ASSERT may
// appear on any given line.
#define METADATA_ASSERT(DESC) ADD_METADATA_COMPLEX_CHECK(DESC, GENCLSNAME(__LINE__))
119+
120+
121+
#include "metadata_constraints.cpp"
122+
123+
// Returns a copy of `s` in which every non-printable byte is replaced by a
// two-digit lowercase hexadecimal escape of the form \xNN.
//
// This function is intended for pretty printing of regexps with non-printable
// characters. In a general purpose/rigorous version we should escape the
// escape symbol (backslash) too, but that doesn't play well with the purpose
// stated above.
std::string escapeNonPrintableChars(const std::string& s)
{
  std::ostringstream os;
  os << std::hex << std::setfill('0');
  for (const char c : s) {
    // std::isprint() has undefined behaviour for negative arguments other
    // than EOF, and plain char may be signed, so bytes >= 0x80 (such as the
    // first byte of the PNG signature) must go through unsigned char first.
    const unsigned char byte = static_cast<unsigned char>(c);
    if (std::isprint(byte)) {
      os << c;
    } else {
      // The width applies to the next insertion only; the fill character and
      // the hex base set above persist for the whole stream.
      os << "\\x" << std::setw(2) << static_cast<unsigned int>(byte);
    }
  }
  return os.str();
}
141+
142+
// Returns the errors of `e1` followed by the errors of `e2`.
// `e1` is taken by value so that it can be extended and returned directly.
Metadata::Errors concat(Metadata::Errors e1, const Metadata::Errors& e2)
{
  for ( const auto& error : e2 ) {
    e1.push_back(error);
  }
  return e1;
}
147+
148+
} // unnamed namespace
149+
150+
// Public view of the table of reserved metadata defined in
// metadata_constraints.cpp.
const Metadata::ReservedMetadataTable& Metadata::reservedMetadataInfo = reservedMetadataInfoTable;

// Looks up the record of the reserved metadata called `name` (the comparison
// is exact) with a linear scan of the table.
// Throws std::out_of_range if `name` is not a reserved metadata name.
const Metadata::ReservedMetadataRecord&
Metadata::getReservedMetadataRecord(const std::string& name)
{
  auto it = reservedMetadataInfo.begin();
  const auto tableEnd = reservedMetadataInfo.end();
  while ( it != tableEnd && !(it->name == name) ) {
    ++it;
  }

  if ( it == tableEnd ) {
    throw std::out_of_range(name + " is not a reserved metadata name");
  }

  return *it;
}
162+
163+
bool Metadata::has(const std::string& name) const
164+
{
165+
return data.find(name) != data.end();
166+
}
167+
168+
const std::string& Metadata::operator[](const std::string& name) const
169+
{
170+
return data.at(name);
171+
}
172+
173+
void Metadata::set(const std::string& name, const std::string& value)
174+
{
175+
data[name] = value;
176+
}
177+
178+
bool Metadata::valid() const
179+
{
180+
return check().empty();
181+
}
182+
183+
// Returns one error for every mandatory reserved metadata that has not been
// set, in the order in which they appear in the reserved metadata table.
Metadata::Errors Metadata::checkMandatoryMetadata() const
{
  Errors missing;
  for ( const ReservedMetadataRecord& record : reservedMetadataInfo ) {
    if ( !record.isMandatory || has(record.name) )
      continue;

    missing.push_back("Missing mandatory metadata: " + record.name );
  }

  return missing;
}
194+
195+
// Checks every metadata that has been set against the constraints listed for
// it in the reserved metadata table (minimum length, maximum length, regular
// expression) and returns the violations found. A length limit of 0 and an
// empty regex mean "no constraint". Metadata whose names are not reserved are
// not checked at all.
Metadata::Errors Metadata::checkSimpleConstraints() const
{
  Errors errors;
  for ( const auto& nv : data ) {
    const std::string& name = nv.first;
    const std::string& value = nv.second;

    // Only the table lookup is guarded, so that a std::out_of_range coming
    // from anywhere else is not mistaken for "not a reserved metadata".
    const ReservedMetadataRecord* rmr = nullptr;
    try {
      rmr = &getReservedMetadataRecord(name);
    } catch ( const std::out_of_range& ) {
      continue; // ignore non-reserved metadata
    }

    // The value is decoded as text at most once, and only if a length limit
    // is actually set (binary values such as illustrations have none).
    if ( rmr->minLength != 0 || rmr->maxLength != 0 ) {
      const size_t length = getTextLength(value);
      if ( rmr->minLength != 0 && length < rmr->minLength ) {
        std::ostringstream oss;
        oss << name << " must contain at least " << rmr->minLength << " characters";
        errors.push_back(oss.str());
      }
      if ( rmr->maxLength != 0 && length > rmr->maxLength ) {
        std::ostringstream oss;
        oss << name << " must contain at most " << rmr->maxLength << " characters";
        errors.push_back(oss.str());
      }
    }

    if ( !rmr->regex.empty() && !matchRegex(rmr->regex, value) ) {
      // The regex may contain non-printable characters (e.g. the PNG one).
      const std::string regex = escapeNonPrintableChars(rmr->regex);
      errors.push_back(name + " doesn't match regex: " + regex);
    }
  }
  return errors;
}
223+
224+
// Runs every registered complex check against this metadata and returns the
// descriptions of the checks that failed. The list of checks is walked from
// the most recently constructed check back to the first one.
Metadata::Errors Metadata::checkComplexConstraints() const
{
  Errors failures;
  const MetadataComplexCheckBase* current = MetadataComplexCheckBase::getLastCheck();
  while ( current != nullptr ) {
    const bool passed = current->checkMetadata(*this);
    if ( !passed ) {
      failures.push_back(current->description);
    }
    current = current->prev;
  }
  return failures;
}
235+
236+
// Validates the metadata and returns the list of errors found (empty if the
// metadata is valid).
//
// The complex checks run only when all mandatory metadata are present and all
// simple constraints hold; otherwise only the errors of those two basic
// stages are reported (mandatory metadata errors first).
Metadata::Errors Metadata::check() const
{
  const Errors mandatoryErrors = checkMandatoryMetadata();
  const Errors simpleErrors = checkSimpleConstraints();

  const bool basicsAreFine = mandatoryErrors.empty() && simpleErrors.empty();
  if ( basicsAreFine ) {
    return checkComplexConstraints();
  }

  return concat(mandatoryErrors, simpleErrors);
}
245+
246+
} // namespace zim

src/metadata.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Copyright 2023 Veloman Yunkan <veloman.yunkan@gmail.com>
3+
*
4+
* This program is free software; you can redistribute it and/or modify
5+
* it under the terms of the GNU General Public License as published by
6+
* the Free Software Foundation; either version 3 of the License, or
7+
* any later version.
8+
*
9+
* This program is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU General Public License
15+
* along with this program; if not, write to the Free Software
16+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17+
* MA 02110-1301, USA.
18+
*/
19+
20+
#ifndef OPENZIM_METADATA_H
21+
#define OPENZIM_METADATA_H
22+
23+
#include <string>
24+
#include <vector>
25+
#include <map>
26+
27+
namespace zim
28+
{
29+
30+
// A collection of name/value metadata entries together with the means of
// validating them against the table of reserved metadata.
class Metadata
{
  using KeyValueMap = std::map<std::string, std::string>;

public: // types
  // Constraints on a single reserved metadata.
  struct ReservedMetadataRecord
  {
    const std::string name;
    const bool        isMandatory;
    const size_t      minLength;
    const size_t      maxLength;
    const std::string regex;
  };

  using ReservedMetadataTable = std::vector<ReservedMetadataRecord>;

  // List of human-readable error messages.
  using Errors = std::vector<std::string>;

  using Iterator = KeyValueMap::const_iterator;

public: // data
  static const ReservedMetadataTable& reservedMetadataInfo;

public: // functions
  void set(const std::string& name, const std::string& value);
  bool has(const std::string& name) const;
  const std::string& operator[](const std::string& name) const;

  bool valid() const;
  Errors check() const;

  static const ReservedMetadataRecord& getReservedMetadataRecord(const std::string& name);

  // Read-only iteration over the stored (name, value) pairs.
  Iterator begin() const { return data.begin(); }
  Iterator end() const { return data.end(); }

private: // functions
  Errors checkMandatoryMetadata() const;
  Errors checkSimpleConstraints() const;
  Errors checkComplexConstraints() const;

private: // data
  KeyValueMap data;
};
74+
75+
} // namespace zim
76+
77+
#endif // OPENZIM_METADATA_H

src/metadata_constraints.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Table of the reserved metadata and their simple constraints.
// A minLength or maxLength of 0 and an empty regex mean that the respective
// constraint is not checked.
const Metadata::ReservedMetadataTable reservedMetadataInfoTable = {
  // name                   isMandatory  minLength  maxLength  regex
  { "Name",                 MANDATORY,   1,         0,         ""           },
  { "Title",                MANDATORY,   1,         30,        ""           },
  { "Language",             MANDATORY,   3,         0,         LANGS_REGEXP },
  { "Creator",              MANDATORY,   1,         0,         ""           },
  { "Publisher",            MANDATORY,   1,         0,         ""           },
  { "Date",                 MANDATORY,   10,        10,        DATE_REGEXP  },
  { "Description",          MANDATORY,   1,         80,        ""           },
  { "LongDescription",      OPTIONAL,    0,         4000,      ""           },
  { "License",              OPTIONAL,    0,         0,         ""           },
  { "Tags",                 OPTIONAL,    0,         0,         ""           },
  { "Relation",             OPTIONAL,    0,         0,         ""           },
  { "Flavour",              OPTIONAL,    0,         0,         ""           },
  { "Source",               OPTIONAL,    0,         0,         ""           },
  { "Counter",              OPTIONAL,    0,         0,         ""           },
  { "Scraper",              OPTIONAL,    0,         0,         ""           },

  // There are no constraints on the illustration metadata size in order to
  // avoid decoding it as UTF-8 encoded text.
  { "Illustration_48x48@1", MANDATORY,   0,         0,         PNG_REGEXP   },
};
27+
28+
// LongDescription is optional, but when present it must not be shorter than
// Description. Lengths are compared in characters (via getTextLength()), as
// for the other length constraints, rather than in bytes: with UTF-8 a byte
// comparison can give the wrong answer when the two texts use characters of
// different encoded widths.
// Description is mandatory and complex checks only run once all mandatory
// metadata are present, so it can be accessed unconditionally here.
METADATA_ASSERT("LongDescription shouldn't be shorter than Description")
{
  if ( !data.has("LongDescription") )
    return true;

  return getTextLength(data["LongDescription"]) >= getTextLength(data["Description"]);
}

0 commit comments

Comments
 (0)