From 313a0382b8e366e41371f4c198e3043fab5824ab Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 17 Feb 2023 15:01:40 +0000 Subject: [PATCH 01/14] Add `--jsonl` option that prints only metadata --- CMakeLists.txt | 2 +- src/bilangwriter.cc | 38 ++++++++++++++++++++++++++++++++++++++ src/bilangwriter.hh | 18 ++++++++++++++++-- src/record.cc | 6 ++++-- src/record.hh | 15 ++++++++++++--- src/warcpreprocessor.cc | 18 ++++++++++++++---- src/warcpreprocessor.hh | 5 +++-- src/warcreader.cc | 4 ++++ src/warcreader.hh | 1 + warc2text_main.cc | 5 ++++- 10 files changed, 97 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e90b00..397aeb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 2.8.3) project(warc2text) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_FLAGS "-Wall -Wextra -DBOOST_LOG_DYN_LINK ${CMAKE_CXX_FLAGS}") diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index a6f1936..cc54517 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -2,6 +2,32 @@ #include "util.hh" #include #include +#include + + +namespace { + /** + * Little bit of JSON wrapping to make sure we only ever print safe values + */ + + template + struct JSONValue { + const T& ref; + }; + + template + JSONValue escapeJSON(T const &ref) { + return JSONValue{ref}; + } + + std::ostream &operator<<(std::ostream &out, JSONValue const &val) { + return out << val.ref; + } + + std::ostream &operator<<(std::ostream &out, JSONValue const &val) { + return out << std::quoted(val.ref); + } +} namespace warc2text{ @@ -144,5 +170,17 @@ namespace warc2text{ } } + void JSONLinesWriter::write(const Record& record, bool multilang, bool paragraph_identification) { + // JSON lines format (https://jsonlines.org) + out_ << "{" + << "\"f\":" << escapeJSON(record.getFilename()) << "," + << "\"o\":" << escapeJSON(record.getOffset()) << "," + << "\"rs\":" << escapeJSON(record.getPayload().size()) << "," 
+ << "\"ps\":" << escapeJSON(record.getPlainText().size()) << "," + << "\"l\":" << escapeJSON(record.getLanguage()) << "," + << "\"u\":" << escapeJSON(record.getURL()) << "," + << "\"c\":" << escapeJSON(record.getHTTPcontentType()) + << "}\n"; + } } diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh index 3b520f6..4cb6bc5 100644 --- a/src/bilangwriter.hh +++ b/src/bilangwriter.hh @@ -3,11 +3,18 @@ #include #include +#include #include "record.hh" #include "zlib.h" namespace warc2text { + class RecordWriter { + public: + virtual void write(const Record& record, bool multilang = false, bool paragraph_identification = false) = 0; + virtual ~RecordWriter() = default; + }; + class GzipWriter { private: FILE* dest; @@ -27,7 +34,7 @@ namespace warc2text { static const std::size_t BUFFER_SIZE = 4096; }; - class BilangWriter { + class BilangWriter : public RecordWriter { private: std::string folder; std::unordered_map url_files; @@ -57,11 +64,18 @@ namespace warc2text { output_files(output_files) {}; - void write(const Record& record, bool multilang = false, bool paragraph_identification = false); + virtual void write(const Record& record, bool multilang = false, bool paragraph_identification = false); }; + class JSONLinesWriter : public RecordWriter { + private: + std::ostream &out_; + public: + explicit JSONLinesWriter(std::ostream &out) : out_(out) {}; + virtual void write(const Record& record, bool multilang = false, bool paragraph_identification = false); + }; } #endif diff --git a/src/record.cc b/src/record.cc index e2e9615..6e137fe 100644 --- a/src/record.cc +++ b/src/record.cc @@ -34,7 +34,10 @@ namespace warc2text { return header_end + 4; } - Record::Record(const std::string& content) { + Record::Record(const std::string& content, const std::string& filename, std::size_t offset) : + filename(filename), + offset(offset) + { std::string line; std::size_t last_pos = 0, payload_start = 0; std::size_t pos = content.find("WARC/1.0\r\n"); @@ -306,5 +309,4 @@ namespace 
warc2text { void Record::encodeURL() { url = util::encodeURLs(url); } - } // warc2text diff --git a/src/record.hh b/src/record.hh index b66a740..f41b10d 100644 --- a/src/record.hh +++ b/src/record.hh @@ -14,9 +14,7 @@ namespace warc2text { class Record { public: - Record() {}; - - explicit Record(const std::string& content); + Record(const std::string& content, const std::string &filename, std::size_t offset); const std::string& getHeaderProperty(const std::string& property) const; bool headerExists(const std::string& property) const; @@ -34,6 +32,14 @@ namespace warc2text { bool isBroaderDocumentFormat() const; bool isTextFormat() const; + inline const std::string& getFilename() const { + return filename; + } + + inline std::size_t getOffset() const { + return offset; + } + const std::unordered_map& getTextByLangs() const; int cleanPayload(); @@ -46,6 +52,9 @@ namespace warc2text { void encodeURL(); private: + const std::string &filename; + std::size_t offset; + std::unordered_map header; std::unordered_map HTTPheader; std::string payload; diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index f87fef0..8795aa1 100644 --- a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -1,3 +1,5 @@ + +#include #include "warcpreprocessor.hh" #include "zipreader.hh" #include "util/compress.hh" @@ -11,8 +13,7 @@ namespace warc2text { WARCPreprocessor::WARCPreprocessor(const std::string& outputFolder, const std::unordered_set& output_files, const std::string& pdf_warc_filename, const std::string& tagFiltersFile, bool invert, const std::string& urlFiltersFile, bool multilang, bool encodeURLs, - bool paragraph_identification) : - writer(outputFolder, output_files), + bool paragraph_identification, bool jsonl) : totalRecords(0), textRecords(0), langRecords(0), @@ -25,6 +26,13 @@ namespace warc2text { multilang(multilang), encodeURLs(encodeURLs), paragraph_identification(paragraph_identification) { + if (jsonl) + writer = std::make_unique(std::cout); + else if 
(!output_files.empty()) + writer = std::make_unique(outputFolder, output_files); + else + std::exit(1); + if (!tagFiltersFile.empty()) util::readTagFiltersRegex(tagFiltersFile, tagFilters); @@ -54,6 +62,7 @@ namespace warc2text { BOOST_LOG_TRIVIAL(info) << "Processing " << filename; WARCReader reader(filename); + std::size_t offset; std::string content; bool done = false; int n_langs = 0; @@ -62,11 +71,12 @@ namespace warc2text { WARCWriter pdf_warc_writer; while (!done) { + offset = reader.tell(); done = !reader.getRecord(content); if (done or content.empty()) continue; - Record record(content); + Record record(content, filename, offset); if (record.getPayload().empty()) continue; @@ -160,7 +170,7 @@ namespace warc2text { langRecords += n_langs; - writer.write(record, multilang, paragraph_identification); + writer->write(record, multilang, paragraph_identification); } pdf_warc_writer.close(); } diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh index 8e8548f..673dfcb 100644 --- a/src/warcpreprocessor.hh +++ b/src/warcpreprocessor.hh @@ -5,6 +5,7 @@ #include "warcreader.hh" #include "bilangwriter.hh" #include "util.hh" +#include #include #include #include @@ -24,7 +25,7 @@ namespace warc2text { class WARCPreprocessor { private: - BilangWriter writer; + std::unique_ptr writer; unsigned int totalRecords; unsigned int textRecords; unsigned int langRecords; @@ -46,7 +47,7 @@ namespace warc2text { explicit WARCPreprocessor(const std::string& outputFolder, const std::unordered_set& output_files = {}, const std::string& pdf_warc_filename = "", const std::string& tagFiltersFile = "", bool invert = false, const std::string& urlFiltersFile = "", bool multilang = false, - bool encodeURLs = false, bool paragraph_identification = false); + bool encodeURLs = false, bool paragraph_identification = false, bool jsonl = false); void process(const std::string &filename); void printStatistics() const; }; diff --git a/src/warcreader.cc b/src/warcreader.cc index 
d78bcce..1d34853 100644 --- a/src/warcreader.cc +++ b/src/warcreader.cc @@ -100,4 +100,8 @@ namespace warc2text { return len; } + std::size_t WARCReader::tell() const { + return std::ftell(file) - s.avail_in; + } + } // warc2text diff --git a/src/warcreader.hh b/src/warcreader.hh index d3d3107..ebc1c6d 100644 --- a/src/warcreader.hh +++ b/src/warcreader.hh @@ -10,6 +10,7 @@ namespace warc2text { WARCReader(); explicit WARCReader(const std::string& filename); bool getRecord(std::string& out, std::size_t max_size = 1024*1024*20); //20MB + std::size_t tell() const; ~WARCReader(); private: std::FILE* file; diff --git a/warc2text_main.cc b/warc2text_main.cc index dac24e4..3f8f417 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -25,6 +25,7 @@ struct Options { std::string url_filters_filename; bool multilang{}; bool encodeURLs{}; + bool jsonl{}; }; void parseArgs(int argc, char *argv[], Options& out) { @@ -43,6 +44,7 @@ void parseArgs(int argc, char *argv[], Options& out) { ("verbose,v", po::bool_switch(&out.verbose)->default_value(false), "Verbosity level") ("silent,s", po::bool_switch(&out.silent)->default_value(false)) ("multilang", po::bool_switch(&out.multilang)->default_value(false), "Detect multiple languages in a single record") + ("jsonl", po::bool_switch(&out.jsonl)->default_value(false), "Output jsonl to stdout") ("encode-urls", po::bool_switch(&out.encodeURLs)->default_value(false), "Encode URLs obtained from WARC records"); po::positional_options_description pd; @@ -69,6 +71,7 @@ void parseArgs(int argc, char *argv[], Options& out) { " --encode-urls Encode URLs obtained from WARC records\n" " --paragraph-identification Add paragraph index for each sentence extracted from the html\n" " -s Only output errors\n" + " --jsonl Write JSONLines to stdout\n" " -v Verbose output (print trace)\n\n"; exit(1); } @@ -97,7 +100,7 @@ int main(int argc, char *argv[]) { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); WARCPreprocessor 
warcpproc(options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename, options.tag_filters_invert, options.url_filters_filename, options.multilang, - options.encodeURLs, options.paragraph_identification); + options.encodeURLs, options.paragraph_identification, options.jsonl); for (const std::string& file : options.warcs){ warcpproc.process(file); } From c6a193c249be7cdad5543e97ea006dbe29d03e6d Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Mon, 13 Mar 2023 13:14:33 +0000 Subject: [PATCH 02/14] Remove ownership of RecordWriter from WARCPreprocessor WARCPreprocessor is already complicated enough as is. No need to pass in all those options just to construct a writer. --- src/warcpreprocessor.cc | 15 +++++---------- src/warcpreprocessor.hh | 6 +++--- warc2text_main.cc | 14 ++++++++++++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index 8795aa1..9582c9a 100644 --- a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -1,5 +1,6 @@ #include +#include "src/bilangwriter.hh" #include "warcpreprocessor.hh" #include "zipreader.hh" #include "util/compress.hh" @@ -10,10 +11,11 @@ namespace warc2text { const std::unordered_set WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3", ".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" }; - WARCPreprocessor::WARCPreprocessor(const std::string& outputFolder, const std::unordered_set& output_files, + WARCPreprocessor::WARCPreprocessor(RecordWriter &writer, const std::string& pdf_warc_filename, const std::string& tagFiltersFile, bool invert, const std::string& urlFiltersFile, bool multilang, bool encodeURLs, - bool paragraph_identification, bool jsonl) : + bool paragraph_identification) : + writer(writer), totalRecords(0), textRecords(0), langRecords(0), @@ -26,13 +28,6 @@ namespace warc2text { multilang(multilang), encodeURLs(encodeURLs), paragraph_identification(paragraph_identification) { - 
if (jsonl) - writer = std::make_unique(std::cout); - else if (!output_files.empty()) - writer = std::make_unique(outputFolder, output_files); - else - std::exit(1); - if (!tagFiltersFile.empty()) util::readTagFiltersRegex(tagFiltersFile, tagFilters); @@ -170,7 +165,7 @@ namespace warc2text { langRecords += n_langs; - writer->write(record, multilang, paragraph_identification); + writer.write(record, multilang, paragraph_identification); } pdf_warc_writer.close(); } diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh index 673dfcb..f035fd1 100644 --- a/src/warcpreprocessor.hh +++ b/src/warcpreprocessor.hh @@ -25,7 +25,7 @@ namespace warc2text { class WARCPreprocessor { private: - std::unique_ptr writer; + RecordWriter &writer; unsigned int totalRecords; unsigned int textRecords; unsigned int langRecords; @@ -44,10 +44,10 @@ namespace warc2text { bool URLfilter(const std::string& url); public: - explicit WARCPreprocessor(const std::string& outputFolder, const std::unordered_set& output_files = {}, + explicit WARCPreprocessor(RecordWriter &writer, const std::string& pdf_warc_filename = "", const std::string& tagFiltersFile = "", bool invert = false, const std::string& urlFiltersFile = "", bool multilang = false, - bool encodeURLs = false, bool paragraph_identification = false, bool jsonl = false); + bool encodeURLs = false, bool paragraph_identification = false); void process(const std::string &filename); void printStatistics() const; }; diff --git a/warc2text_main.cc b/warc2text_main.cc index 3f8f417..3456a21 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -97,10 +97,20 @@ int main(int argc, char *argv[]) { boost::algorithm::split(files_list, options.files, [](char c) {return c == ',';}); std::unordered_set output_files(files_list.begin(), files_list.end()); + std::unique_ptr writer; + if (options.jsonl) { + writer = std::make_unique(std::cout); + } else if (!output_files.empty()) { + writer = std::make_unique(options.output, output_files); + } 
else { + BOOST_LOG_TRIVIAL(error) << "No output files specified"; + abort(); + } + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - WARCPreprocessor warcpproc(options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename, + WARCPreprocessor warcpproc(*writer, options.pdf_warc_filename, options.tag_filters_filename, options.tag_filters_invert, options.url_filters_filename, options.multilang, - options.encodeURLs, options.paragraph_identification, options.jsonl); + options.encodeURLs, options.paragraph_identification); for (const std::string& file : options.warcs){ warcpproc.process(file); } From 23c1a74a2607212bec56420be8cd9ae5b3d59800 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 16 Mar 2023 18:19:56 +0000 Subject: [PATCH 03/14] Write compressed record size --- src/bilangwriter.cc | 1 + src/record.cc | 3 ++- src/record.hh | 9 +++++++-- src/warcpreprocessor.cc | 18 +++++++++++------- src/warcreader.cc | 14 +++++++------- src/warcreader.hh | 2 +- 6 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index 5d7747c..b6a09d7 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -160,6 +160,7 @@ namespace warc2text{ out_ << "{" << "\"f\":" << escapeJSON(record.getFilename()) << "," << "\"o\":" << escapeJSON(record.getOffset()) << "," + << "\"s\":" << escapeJSON(record.getSize()) << "," << "\"rs\":" << escapeJSON(record.getPayload().size()) << "," << "\"ps\":" << escapeJSON(record.getPlainText().size()) << "," << "\"l\":" << escapeJSON(record.getLanguage()) << "," diff --git a/src/record.cc b/src/record.cc index 746fc56..58fcf51 100644 --- a/src/record.cc +++ b/src/record.cc @@ -34,8 +34,9 @@ namespace warc2text { return header_end + 4; } - Record::Record(const std::string& content, const std::string& filename, std::size_t offset) : + Record::Record(const std::string& content, const std::string& filename, std::size_t size, std::size_t 
offset) : filename(filename), + size(size), offset(offset) { std::string line; diff --git a/src/record.hh b/src/record.hh index b510622..1bc1be5 100644 --- a/src/record.hh +++ b/src/record.hh @@ -14,7 +14,7 @@ namespace warc2text { class Record { public: - Record(const std::string& content, const std::string &filename, std::size_t offset); + Record(const std::string& content, const std::string &filename, std::size_t size, std::size_t offset); const std::string& getHeaderProperty(const std::string& property) const; bool headerExists(const std::string& property) const; @@ -36,6 +36,10 @@ namespace warc2text { return filename; } + inline std::size_t getSize() const { + return size; + } + inline std::size_t getOffset() const { return offset; } @@ -53,7 +57,8 @@ namespace warc2text { private: const std::string &filename; - std::size_t offset; + std::size_t size; // compressed record length in WARC + std::size_t offset; // byte offset of start of record in WARC std::unordered_map header; std::unordered_map HTTPheader; diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index 926dfae..30ac885 100644 --- a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -58,21 +58,25 @@ namespace warc2text { BOOST_LOG_TRIVIAL(info) << "Processing " << filename; WARCReader reader(filename); - std::size_t offset; std::string content; - bool done = false; int n_langs = 0; bool pdfpass = !pdf_warc_filename.empty(); WARCWriter pdf_warc_writer; - while (!done) { - offset = reader.tell(); - done = !reader.getRecord(content); - if (done or content.empty()) + while (true) { + std::size_t offset = reader.tell(); + std::size_t size = reader.getRecord(content); + + // No more records (EOF or failure to inflate) + if (size == 0) + break; + + // Skipped record (i.e. 
larger than max_size) + if (content.empty()) continue; - Record record(content, filename, offset); + Record record(content, filename, size, offset); if (record.getPayload().empty()) continue; diff --git a/src/warcreader.cc b/src/warcreader.cc index 1d34853..c2bff4d 100644 --- a/src/warcreader.cc +++ b/src/warcreader.cc @@ -33,18 +33,18 @@ namespace warc2text { closeFile(); } - bool WARCReader::getRecord(std::string& out, std::size_t max_size){ + std::size_t WARCReader::getRecord(std::string& out, std::size_t max_size){ int inflate_ret = 0; out.clear(); - std::size_t len; + std::size_t offset = tell(); bool skip_record = false; while (inflate_ret != Z_STREAM_END) { if (s.avail_in == 0) { - len = readChunk(); + std::size_t len = readChunk(); if (len <= 0) { // nothing more to read out.clear(); - return false; + return 0; } s.avail_in = len; s.next_in = buf; @@ -57,7 +57,7 @@ namespace warc2text { if (inflate_ret != Z_OK && inflate_ret != Z_STREAM_END) { BOOST_LOG_TRIVIAL(error) << "WARC " << warc_filename << ": error during decompressing"; out.clear(); - return false; + return 0; } if (not skip_record) out.append(scratch, scratch + (BUFFER_SIZE - s.avail_out)); if (out.size() > max_size) { @@ -74,7 +74,7 @@ namespace warc2text { // next in and avail_in are updated while inflating, so no need to update them manually } } - return true; + return tell() - offset; } void WARCReader::openFile(const std::string& filename){ @@ -101,7 +101,7 @@ namespace warc2text { } std::size_t WARCReader::tell() const { - return std::ftell(file) - s.avail_in; + return std::ftell(const_cast(file)) - s.avail_in; } } // warc2text diff --git a/src/warcreader.hh b/src/warcreader.hh index ebc1c6d..62685c7 100644 --- a/src/warcreader.hh +++ b/src/warcreader.hh @@ -9,7 +9,7 @@ namespace warc2text { public: WARCReader(); explicit WARCReader(const std::string& filename); - bool getRecord(std::string& out, std::size_t max_size = 1024*1024*20); //20MB + std::size_t getRecord(std::string& out, 
std::size_t max_size = 1024*1024*20); //20MB std::size_t tell() const; ~WARCReader(); private: From b7acdaec3dfbd40a2242a12a2f00370730c0bb4b Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 16 Mar 2023 18:37:38 +0000 Subject: [PATCH 04/14] Add optional `file` output file Which contains the `warc_path:offset:size` for each line. --- src/bilangwriter.cc | 12 +++++++++--- src/bilangwriter.hh | 15 ++++----------- warc2text_main.cc | 2 +- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index b6a09d7..bf8d154 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -98,13 +98,15 @@ namespace warc2text{ return dest != nullptr; } - void BilangWriter::write(const std::string& lang, const std::string& b64text, const std::string& url, const std::string& mime, const std::string& b64html) { + void BilangWriter::write(const std::string& lang, const std::string& b64text, const std::string& url, const std::string& mime, const std::string& b64html, const std::string& file) { GzipWriter* gzurl = &url_files[lang]; GzipWriter* gztext = &text_files[lang]; GzipWriter* gzmime = nullptr; GzipWriter* gzhtml = nullptr; + GzipWriter* gzfile = nullptr; if (output_files.count("mime") == 1) gzmime = &(mime_files[lang]); if (output_files.count("html") == 1) gzhtml = &(html_files[lang]); + if (output_files.count("file") == 1) gzfile = &(file_files[lang]); if (!gzurl->is_open()) { // if one file does not exist, the rest shouldn't either std::string path = folder + "/" + lang; @@ -113,12 +115,14 @@ namespace warc2text{ gztext->open(path + "/text.gz"); if (gzmime != nullptr) gzmime->open(path + "/mime.gz"); if (gzhtml != nullptr) gzhtml->open(path + "/html.gz"); + if (gzfile != nullptr) gzfile->open(path + "/file.gz"); } gzurl->writeLine(url); gztext->writeLine(b64text); if (gzmime != nullptr) gzmime->writeLine(mime); if (gzhtml != nullptr) gzhtml->writeLine(b64html); + if (gzfile != nullptr) gzfile->writeLine(file); } 
std::string get_paragraph_id(const std::string& text) { @@ -143,6 +147,8 @@ namespace warc2text{ if (output_files.count("html") == 1) util::encodeBase64(record.getPayload(), base64html); + std::string file = record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize()); + for (const auto& it : record.getTextByLangs()) { std::string payload = it.second; @@ -151,11 +157,11 @@ namespace warc2text{ } util::encodeBase64(payload, base64text); - this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html); + this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html, file); } } - void JSONLinesWriter::write(const Record& record, bool paragraph_identification) { + void JSONLinesWriter::write(const Record& record, [[maybe_unused]] bool paragraph_identification) { // JSON lines format (https://jsonlines.org) out_ << "{" << "\"f\":" << escapeJSON(record.getFilename()) << "," diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh index d3f7946..265200d 100644 --- a/src/bilangwriter.hh +++ b/src/bilangwriter.hh @@ -41,26 +41,19 @@ namespace warc2text { std::unordered_map mime_files; std::unordered_map text_files; std::unordered_map html_files; + std::unordered_map file_files; std::unordered_set output_files; - void write(const std::string& lang, const std::string& b64text, const std::string& url, const std::string& mime, const std::string& b64html); + void write(const std::string& lang, const std::string& b64text, const std::string& url, const std::string& mime, const std::string& b64html, const std::string& file); public: - explicit BilangWriter(const std::string& folder) : - folder(folder), - url_files(), - mime_files(), - text_files(), - html_files(), - output_files({}) // url and text are mandatory regardless - {}; - - explicit BilangWriter(const std::string& folder, const std::unordered_set& output_files) : + explicit BilangWriter(const std::string& folder, const 
std::unordered_set& output_files = {}) : folder(folder), url_files(), mime_files(), text_files(), html_files(), + file_files(), output_files(output_files) {}; diff --git a/warc2text_main.cc b/warc2text_main.cc index 1608209..da62b22 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -64,7 +64,7 @@ void parseArgs(int argc, char *argv[], Options& out) { " -o Output folder, required\n" " -f List of output files separated by commas\n" " Default (mandatory): \"url,text\"\n" - " Optional values: \"mime,html\"\n" + " Optional values: \"mime,html,file\"\n" " --classifier Classifier to use: cld2 or fasttext\n" " --fasttext-model Path to FastText model for fasttext classifier\n" " --multilang Detect multiple languages in documents (up to 3),\n" From 97af6710f2531b77acedb6a41b793b3f538d4d51 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 17 Mar 2023 16:34:49 +0000 Subject: [PATCH 05/14] I'm afraid of bare pointers --- src/warcreader.cc | 37 +++++++++++++------------------------ src/warcreader.hh | 8 +++++--- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/warcreader.cc b/src/warcreader.cc index c2bff4d..7722304 100644 --- a/src/warcreader.cc +++ b/src/warcreader.cc @@ -3,18 +3,13 @@ #include namespace warc2text { - WARCReader::WARCReader(){ - warc_filename = ""; - file = nullptr; - - buf = new uint8_t[BUFFER_SIZE]; - scratch = new uint8_t[BUFFER_SIZE]; - + WARCReader::WARCReader() + { s.zalloc = nullptr; s.zfree = nullptr; s.opaque = nullptr; s.avail_in = 0; - s.next_in = buf; + s.next_in = buf.data(); if (inflateInit2(&s, 32) != Z_OK) { BOOST_LOG_TRIVIAL(error) << "Failed to init zlib"; @@ -27,10 +22,7 @@ namespace warc2text { } WARCReader::~WARCReader(){ - delete[] buf; - delete[] scratch; inflateEnd(&s); - closeFile(); } std::size_t WARCReader::getRecord(std::string& out, std::size_t max_size){ @@ -47,11 +39,11 @@ namespace warc2text { return 0; } s.avail_in = len; - s.next_in = buf; + s.next_in = buf.data(); } // inflate until 
either stream end is reached, or there is no more data while (inflate_ret != Z_STREAM_END && s.avail_in != 0) { - s.next_out = scratch; + s.next_out = scratch.data(); s.avail_out = BUFFER_SIZE; inflate_ret = inflate(&s, Z_NO_FLUSH); if (inflate_ret != Z_OK && inflate_ret != Z_STREAM_END) { @@ -59,7 +51,7 @@ namespace warc2text { out.clear(); return 0; } - if (not skip_record) out.append(scratch, scratch + (BUFFER_SIZE - s.avail_out)); + if (not skip_record) out.append(scratch.data(), scratch.data() + (scratch.size() - s.avail_out)); if (out.size() > max_size) { BOOST_LOG_TRIVIAL(trace) << "WARC " << warc_filename << ": skipping large record"; out.clear(); @@ -80,20 +72,17 @@ namespace warc2text { void WARCReader::openFile(const std::string& filename){ warc_filename = filename; if (filename.empty() || filename == "-") - file = std::freopen(nullptr, "rb", stdin); // make sure stdin is open in binary mode - else file = std::fopen(filename.c_str(), "r"); - if (!file) { + file.reset(std::freopen(nullptr, "rb", stdin)); // make sure stdin is open in binary mode + else + file.reset(std::fopen(filename.c_str(), "r")); + if (!file.get()) { BOOST_LOG_TRIVIAL(error) << "WARC " << filename << ": file opening failed, skipping this WARC"; } } - void WARCReader::closeFile() { - if (file) std::fclose(file); - } - std::size_t WARCReader::readChunk(){ - std::size_t len = std::fread(buf, sizeof(uint8_t), BUFFER_SIZE, file); - if (std::ferror(file) && !std::feof(file)) { + std::size_t len = std::fread(buf.data(), sizeof(uint8_t), BUFFER_SIZE, file.get()); + if (std::ferror(file.get()) && !std::feof(file.get())) { BOOST_LOG_TRIVIAL(error) << "WARC " << warc_filename << ": error during reading"; return 0; } @@ -101,7 +90,7 @@ namespace warc2text { } std::size_t WARCReader::tell() const { - return std::ftell(const_cast(file)) - s.avail_in; + return std::ftell(const_cast(file.get())) - s.avail_in; } } // warc2text diff --git a/src/warcreader.hh b/src/warcreader.hh index 62685c7..32724ab 
100644 --- a/src/warcreader.hh +++ b/src/warcreader.hh @@ -2,7 +2,9 @@ #define WARC2TEXT_WARCREADER_HH #include "zlib.h" +#include #include +#include "util/file.hh" namespace warc2text { class WARCReader { @@ -13,12 +15,12 @@ namespace warc2text { std::size_t tell() const; ~WARCReader(); private: - std::FILE* file; + util::scoped_FILE file; std::string warc_filename; z_stream s{}; static const std::size_t BUFFER_SIZE = 4096; - uint8_t* buf; - uint8_t* scratch; + std::array buf; + std::array scratch; void openFile(const std::string& filename); void closeFile(); From 84d0c4ce0f8a9c4b4376935c1fb6e822e9bce362 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 2 Nov 2023 13:51:31 +0000 Subject: [PATCH 06/14] Reimplement json output using boost, add text --- CMakeLists.txt | 2 +- src/bilangwriter.cc | 48 +++++++++++++-------------------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f3c3ed5..2d235b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif () -find_package(Boost 1.71 COMPONENTS program_options log log_setup REQUIRED) +find_package(Boost 1.75 COMPONENTS program_options json log log_setup REQUIRED) # compile executable into bin/ set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index bf8d154..8c40a27 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -4,32 +4,9 @@ #include #include #include +#include -namespace { - /** - * Little bit of JSON wrapping to make sure we only ever print safe values - */ - - template - struct JSONValue { - const T& ref; - }; - - template - JSONValue escapeJSON(T const &ref) { - return JSONValue{ref}; - } - - std::ostream &operator<<(std::ostream &out, JSONValue const &val) { - return out << val.ref; - } - - std::ostream &operator<<(std::ostream &out, JSONValue const &val) { - return out << std::quoted(val.ref); 
- } -} - namespace warc2text{ GzipWriter::GzipWriter() { @@ -163,16 +140,19 @@ namespace warc2text{ void JSONLinesWriter::write(const Record& record, [[maybe_unused]] bool paragraph_identification) { // JSON lines format (https://jsonlines.org) - out_ << "{" - << "\"f\":" << escapeJSON(record.getFilename()) << "," - << "\"o\":" << escapeJSON(record.getOffset()) << "," - << "\"s\":" << escapeJSON(record.getSize()) << "," - << "\"rs\":" << escapeJSON(record.getPayload().size()) << "," - << "\"ps\":" << escapeJSON(record.getPlainText().size()) << "," - << "\"l\":" << escapeJSON(record.getLanguage()) << "," - << "\"u\":" << escapeJSON(record.getURL()) << "," - << "\"c\":" << escapeJSON(record.getHTTPcontentType()) - << "}\n"; + for (auto &&chunk : record.getTextByLangs()) { + out_ << boost::json::value{ + {"f", boost::json::string(record.getFilename())}, + {"o", boost::json::value(record.getOffset())}, + {"s", boost::json::value(record.getSize())}, + {"rs", boost::json::value(record.getPayload().size())}, + {"ps", boost::json::value(chunk.second.size())}, + {"l", boost::json::string(chunk.first)}, + {"u", boost::json::string(record.getURL())}, + {"c", boost::json::string(record.getHTTPcontentType())}, + {"t", boost::json::string(chunk.second)}, + } << "\n"; + } } } From 38f7edee6bec93da08984557f029dbcf9f679ec5 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 2 Nov 2023 13:51:48 +0000 Subject: [PATCH 07/14] Remove getLanguage() since it isn't functional anymore --- src/record.cc | 4 ---- src/record.hh | 1 - 2 files changed, 5 deletions(-) diff --git a/src/record.cc b/src/record.cc index 58fcf51..1b6206d 100644 --- a/src/record.cc +++ b/src/record.cc @@ -279,10 +279,6 @@ namespace warc2text { return plaintext; } - const std::string& Record::getLanguage() const { - return language; - } - const std::string& Record::getURL() const { return url; } diff --git a/src/record.hh b/src/record.hh index 1bc1be5..675fcc9 100644 --- a/src/record.hh +++ b/src/record.hh @@ 
-23,7 +23,6 @@ namespace warc2text { const std::string& getPayload() const; const std::string& getPlainText() const; - const std::string& getLanguage() const; const std::string& getURL() const; const std::string& getRecordType() const; const std::string& getWARCcontentType() const; From 57f7336025bacc261998a4acac03dc11a7a26b09 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 2 Nov 2023 14:05:33 +0000 Subject: [PATCH 08/14] `p` instead of `t` for consistency --- src/bilangwriter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index 8c40a27..71b6eee 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -150,7 +150,7 @@ namespace warc2text{ {"l", boost::json::string(chunk.first)}, {"u", boost::json::string(record.getURL())}, {"c", boost::json::string(record.getHTTPcontentType())}, - {"t", boost::json::string(chunk.second)}, + {"p", boost::json::string(chunk.second)}, } << "\n"; } } From abedb5263f77b681838a276ff2af023d47b10d76 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 2 Nov 2023 15:27:15 +0000 Subject: [PATCH 09/14] Weird default --- warc2text_main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc2text_main.cc b/warc2text_main.cc index da62b22..ed43b0f 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -37,7 +37,7 @@ void parseArgs(int argc, char *argv[], Options& out) { desc.add_options() ("help,h", po::bool_switch(), "Show this help message") ("output,o", po::value(&out.output)->default_value("."), "Output folder") - ("files,f", po::value(&out.files)->default_value("url,token"), "List of output files separated by commas. Default (mandatory files): 'url,text'. Optional: 'mime,html'") + ("files,f", po::value(&out.files)->default_value("url,text"), "List of output files separated by commas. Default: 'url,text'. 
Optional: 'mime,html,file'") ("input,i", po::value(&out.warcs)->multitoken(), "Input WARC file name(s)") ("tag-filters", po::value(&out.tag_filters_filename), "Plain text file containing tag filters") ("invert-tag-filters", po::bool_switch(&out.tag_filters_invert)->default_value(false), "Invert tag filter application") From 0095b59c6d3eda956a9e204c136083d7463cdb74 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 2 Nov 2023 15:28:16 +0000 Subject: [PATCH 10/14] Rework BilangWriter into a per language LangWriter I know, more classes, but each one is significantly simpler :tada: --- src/bilangwriter.cc | 113 +++++++++++++++++++++----------------------- src/bilangwriter.hh | 49 +++++++++++-------- src/util.cc | 10 ++-- src/util.hh | 4 +- 4 files changed, 90 insertions(+), 86 deletions(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index 71b6eee..08ea184 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -9,27 +9,20 @@ namespace warc2text{ - GzipWriter::GzipWriter() { - dest = nullptr; - compressed = 0; - s.zalloc = nullptr; - s.zfree = nullptr; - s.opaque = nullptr; - int ret = deflateInit2(&s, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); - assert(ret == Z_OK); - buf = new unsigned char[BUFFER_SIZE]; + GzipWriter::GzipWriter() + : dest(nullptr), + buf(new unsigned char[BUFFER_SIZE]) { + // } GzipWriter::~GzipWriter() { - if (dest) { - this->compress("", 0, Z_FINISH); - deflateEnd(&s); - std::fclose(dest); - } + if (is_open()) + close(); delete[] buf; } void GzipWriter::compress(const char *in, std::size_t size, int flush) { + assert(is_open()); if (size == 0 && flush == Z_NO_FLUSH) return; s.avail_in = size; s.next_in = (Bytef *) in; @@ -42,7 +35,7 @@ namespace warc2text{ s.next_out = buf; ret = deflate(&s, flush); assert(ret == Z_OK || ret == Z_STREAM_END); // Z_STREAM_END only happens if flush == Z_FINISH - compressed = BUFFER_SIZE - s.avail_out; + std::size_t compressed = BUFFER_SIZE - s.avail_out; //written = 
std::fwrite(buf, 1, compressed, dest); std::fwrite(buf, 1, compressed, dest); // TODO error handling @@ -55,51 +48,64 @@ namespace warc2text{ void GzipWriter::open(const std::string& filename) { dest = std::fopen(filename.c_str(), "wb"); UTIL_THROW_IF(!dest, util::ErrnoException, "while creating " << filename); + s.zalloc = nullptr; + s.zfree = nullptr; + s.opaque = nullptr; + int ret = deflateInit2(&s, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + assert(ret == Z_OK); + } + + void GzipWriter::close() { + compress("", 0, Z_FINISH); + deflateEnd(&s); + std::fclose(dest); + dest = nullptr; } void GzipWriter::write(const char* text, std::size_t size) { - this->compress(text, size, Z_NO_FLUSH); + compress(text, size, Z_NO_FLUSH); } void GzipWriter::writeLine(const char* text, std::size_t size) { - this->compress(text, size, Z_NO_FLUSH); - this->compress("\n", 1, Z_NO_FLUSH); + compress(text, size, Z_NO_FLUSH); + compress("\n", 1, Z_NO_FLUSH); } void GzipWriter::writeLine(const std::string& text) { - this->compress(text.c_str(), text.size(), Z_NO_FLUSH); - this->compress("\n", 1, Z_NO_FLUSH); + compress(text.c_str(), text.size(), Z_NO_FLUSH); + compress("\n", 1, Z_NO_FLUSH); } bool GzipWriter::is_open(){ return dest != nullptr; } - void BilangWriter::write(const std::string& lang, const std::string& b64text, const std::string& url, const std::string& mime, const std::string& b64html, const std::string& file) { - GzipWriter* gzurl = &url_files[lang]; - GzipWriter* gztext = &text_files[lang]; - GzipWriter* gzmime = nullptr; - GzipWriter* gzhtml = nullptr; - GzipWriter* gzfile = nullptr; - if (output_files.count("mime") == 1) gzmime = &(mime_files[lang]); - if (output_files.count("html") == 1) gzhtml = &(html_files[lang]); - if (output_files.count("file") == 1) gzfile = &(file_files[lang]); - if (!gzurl->is_open()) { - // if one file does not exist, the rest shouldn't either - std::string path = folder + "/" + lang; - util::createDirectories(path); - 
gzurl->open(path + "/url.gz"); - gztext->open(path + "/text.gz"); - if (gzmime != nullptr) gzmime->open(path + "/mime.gz"); - if (gzhtml != nullptr) gzhtml->open(path + "/html.gz"); - if (gzfile != nullptr) gzfile->open(path + "/file.gz"); - } + LangWriter::LangWriter(const std::string& path, const std::unordered_set& output_files) { + util::createDirectories(path); + + if (output_files.count("url")) + url_file.open(path + "/url.gz"); + if (output_files.count("text")) + text_file.open(path + "/text.gz"); + if (output_files.count("mime")) + mime_file.open(path + "/mime.gz"); + if (output_files.count("html")) + html_file.open(path + "/html.gz"); + if (output_files.count("file")) + file_file.open(path + "/file.gz"); + } - gzurl->writeLine(url); - gztext->writeLine(b64text); - if (gzmime != nullptr) gzmime->writeLine(mime); - if (gzhtml != nullptr) gzhtml->writeLine(b64html); - if (gzfile != nullptr) gzfile->writeLine(file); + void LangWriter::write(Record const &record, std::string const &chunk) { + if (url_file.is_open()) + url_file.writeLine(record.getURL()); + if (mime_file.is_open()) + mime_file.writeLine(record.getHTTPcontentType()); + if (file_file.is_open()) + file_file.writeLine(record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize())); + if (html_file.is_open()) + html_file.writeLine(util::encodeBase64(record.getPayload())); + if (text_file.is_open()) + text_file.writeLine(util::encodeBase64(chunk)); } std::string get_paragraph_id(const std::string& text) { @@ -118,23 +124,14 @@ namespace warc2text{ } void BilangWriter::write(const Record& record, bool paragraph_identification) { - std::string base64text; - std::string base64html; - - if (output_files.count("html") == 1) - util::encodeBase64(record.getPayload(), base64html); - - std::string file = record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize()); - for (const auto& it : record.getTextByLangs()) { - 
std::string payload = it.second; + std::string chunk = it.second; - if (paragraph_identification) { - payload = get_paragraph_id(payload); - } + if (paragraph_identification) + chunk = get_paragraph_id(chunk); - util::encodeBase64(payload, base64text); - this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html, file); + auto writer_it = writers.try_emplace(it.first, folder + "/" + it.first, output_files); + writer_it.first->second.write(record, chunk); } } diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh index 265200d..bb1b6c8 100644 --- a/src/bilangwriter.hh +++ b/src/bilangwriter.hh @@ -9,24 +9,31 @@ namespace warc2text { + /** + * Generic interface for writing records to some form of output. + */ class RecordWriter { public: virtual void write(const Record& record, bool paragraph_identification = false) = 0; virtual ~RecordWriter() = default; }; + /** + * Writer used by BilangWriter to write a single compressed file + * (i.e. a column for a specific language) + */ class GzipWriter { private: FILE* dest; z_stream s{}; unsigned char* buf; - std::size_t compressed; void compress(const char* in, std::size_t size, int flush); public: GzipWriter(); ~GzipWriter(); void open(const std::string& filename); + void close(); void write(const char* text, std::size_t size); void writeLine(const char* text, std::size_t size); void writeLine(const std::string& text); @@ -34,31 +41,35 @@ namespace warc2text { static const std::size_t BUFFER_SIZE = 4096; }; + /** + * Writes records to a specific folder for a specific language. 
+ */ + class LangWriter { + private: + GzipWriter url_file; + GzipWriter mime_file; + GzipWriter text_file; + GzipWriter html_file; + GzipWriter file_file; + public: + LangWriter(const std::string& folder, const std::unordered_set& output_files); + void write(const Record& record, const std::string &chunk); + }; + class BilangWriter : public RecordWriter { private: std::string folder; - std::unordered_map url_files; - std::unordered_map mime_files; - std::unordered_map text_files; - std::unordered_map html_files; - std::unordered_map file_files; std::unordered_set output_files; - - void write(const std::string& lang, const std::string& b64text, const std::string& url, const std::string& mime, const std::string& b64html, const std::string& file); - + std::unordered_map writers; public: - explicit BilangWriter(const std::string& folder, const std::unordered_set& output_files = {}) : - folder(folder), - url_files(), - mime_files(), - text_files(), - html_files(), - file_files(), - output_files(output_files) - {}; + BilangWriter(const std::string& folder, const std::unordered_set& output_files = {}) + : folder(folder) + , output_files(output_files) + { + // + }; virtual void write(const Record& record, bool paragraph_identification = false); - }; class JSONLinesWriter : public RecordWriter { diff --git a/src/util.cc b/src/util.cc index 01ff168..36e2e6a 100644 --- a/src/util.cc +++ b/src/util.cc @@ -82,12 +82,10 @@ namespace util { return boost::locale::conv::to_utf(text, charset); } - void encodeBase64(const std::string& original, std::string& base64){ - preprocess::base64_encode(original, base64); - } - - void decodeBase64(const std::string& base64, std::string& output){ - preprocess::base64_decode(base64, output); + std::string encodeBase64(const std::string &original) { + std::string out; + preprocess::base64_encode(original, out); + return out; } void readTagFiltersRegex(const std::string& filename, umap_tag_filters_regex& filters) { diff --git a/src/util.hh 
b/src/util.hh index a6f5702..dd4407c 100644 --- a/src/util.hh +++ b/src/util.hh @@ -25,9 +25,7 @@ namespace util { std::string toUTF8 (const std::string& text, const std::string& charset); std::string toUTF8 (const char* text, const std::string& charset); - void encodeBase64(const std::string& original, std::string& base64); - - void decodeBase64(const std::string& base64, std::string& output); + std::string encodeBase64(const std::string& original); const std::string reserved_chars_url("!#$&'()*+,/:;=?[]"); std::string encodeURLs(const std::string& url); From 6cb12b9f379a4208a345c96c28e1dece141bb84e Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 3 Nov 2023 17:56:56 +0000 Subject: [PATCH 11/14] Add crawl date output --- src/bilangwriter.cc | 5 +++++ src/bilangwriter.hh | 1 + src/record.cc | 8 ++++++++ src/record.hh | 2 ++ warc2text_main.cc | 2 +- 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index ac8c96c..5688000 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -93,6 +93,8 @@ namespace warc2text{ html_file.open(path + "/html.gz"); if (output_files.count("file")) file_file.open(path + "/file.gz"); + if (output_files.count("date")) + date_file.open(path + "/date.gz"); } void LangWriter::write(Record const &record, std::string const &chunk) { @@ -102,6 +104,8 @@ namespace warc2text{ mime_file.writeLine(record.getHTTPcontentType()); if (file_file.is_open()) file_file.writeLine(record.getFilename() + ":" + std::to_string(record.getOffset()) + ":" + std::to_string(record.getSize())); + if (date_file.is_open()) + date_file.writeLine(record.getWARCdate()); if (html_file.is_open()) html_file.writeLine(util::encodeBase64(record.getPayload())); if (text_file.is_open()) @@ -147,6 +151,7 @@ namespace warc2text{ {"l", boost::json::string(chunk.first)}, {"u", boost::json::string(record.getURL())}, {"c", boost::json::string(record.getHTTPcontentType())}, + {"ts", 
boost::json::string(record.getWARCdate())}, {"p", boost::json::string(chunk.second)}, } << "\n"; } diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh index bb1b6c8..cf80d52 100644 --- a/src/bilangwriter.hh +++ b/src/bilangwriter.hh @@ -51,6 +51,7 @@ namespace warc2text { GzipWriter text_file; GzipWriter html_file; GzipWriter file_file; + GzipWriter date_file; public: LangWriter(const std::string& folder, const std::unordered_set& output_files); void write(const Record& record, const std::string &chunk); diff --git a/src/record.cc b/src/record.cc index 1b6206d..2a3806f 100644 --- a/src/record.cc +++ b/src/record.cc @@ -75,6 +75,10 @@ namespace warc2text { util::toLower(WARCcontentType); } + if (header.count("warc-date") == 1) { + WARCdate = header["warc-date"]; + } + payload_start = last_pos; if (header["warc-type"] == "response") { // parse HTTP header @@ -287,6 +291,10 @@ namespace warc2text { return recordType; } + const std::string& Record::getWARCdate() const { + return WARCdate; + } + const std::string& Record::getWARCcontentType() const { return WARCcontentType; } diff --git a/src/record.hh b/src/record.hh index 675fcc9..00069e7 100644 --- a/src/record.hh +++ b/src/record.hh @@ -26,6 +26,7 @@ namespace warc2text { const std::string& getURL() const; const std::string& getRecordType() const; const std::string& getWARCcontentType() const; + const std::string& getWARCdate() const; const std::string& getHTTPcontentType() const; const std::string& getCharset() const; bool isBroaderDocumentFormat() const; @@ -70,6 +71,7 @@ namespace warc2text { // these are present in the headers, but it's convenient to have them apart also std::string recordType; std::string WARCcontentType; + std::string WARCdate; std::string cleanHTTPcontentType; std::string charset; std::string url; diff --git a/warc2text_main.cc b/warc2text_main.cc index 37b4812..6a839b9 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -57,7 +57,7 @@ void parseArgs(int argc, char *argv[], Options& 
out) { " -o Output folder, required\n" " -f List of output files separated by commas\n" " Default (mandatory): \"url,text\"\n" - " Optional values: \"mime,html,file\"\n" + " Optional values: \"mime,html,file,date\"\n" " --classifier Classifier to use: cld2 or fasttext\n" " --fasttext-model Path to FastText model for fasttext classifier\n" " --multilang Detect multiple languages in documents (up to 3),\n" From 1217d71b5de35771452af45d4627b7583495dcb4 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 3 Nov 2023 17:57:27 +0000 Subject: [PATCH 12/14] Little optimisations to record parsing --- src/record.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/record.cc b/src/record.cc index 2a3806f..9df06c3 100644 --- a/src/record.cc +++ b/src/record.cc @@ -65,10 +65,11 @@ namespace warc2text { if (header.count("warc-target-uri") == 1) { // respect the original casing url = header["warc-target-uri"]; - } - if (!url.empty() && url[0] == '<' && url[url.size()-1] == '>') - url = url.substr(1, url.size()-2); + // Remove any "<" and ">" wrappings from the URL + if (!url.empty() && url[0] == '<' && url[url.size()-1] == '>') + url = url.substr(1, url.size()-2); + } if (header.count("content-type") == 1) { WARCcontentType = header["content-type"]; @@ -80,7 +81,7 @@ namespace warc2text { } payload_start = last_pos; - if (header["warc-type"] == "response") { + if (recordType == "response") { // parse HTTP header pos = content.find("HTTP/1.", last_pos); if (pos == last_pos) { // found HTTP header From 80993e8d167bdf2c198043c83081552ba0ddebd1 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 21 Nov 2023 11:59:22 +0000 Subject: [PATCH 13/14] Update cli flags and describe output --- README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 93c2efe..08cdce1 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,12 @@ warc2text -o [ -f ] [ --pdfpass ] [ 
--paragraph-identification ] [ --tag-filters ] ... ``` * `--output`/`-o` output folder -* `--files`/`-f` list of output files separated by commas (and without `.gz`); `text` and `url` are always written, while `mime` and `html` are optional +* `--files`/`-f` list of output files separated by commas (and without `.gz`); Options are `text`,`html`,`url`,`mime`,`file` and `date`. Defaults to `text,url`. See [output](#output). +* `--jsonl` Produce JSON Lines on stdout instead of writing to files per language. * `--pdfpass` WARC file where PDF records will be stored +* `--robotstxtpass` WARC file where robots.txt related records will be stored +* `--encode-urls` Escape non-ascii characters that appear in the record URL with `%dd` encoding. +* `--multilang` Detect multiple languages in the document, and split the document accordingly. Only supported with CLD2 classifier. * `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML * `--classifier` classifier to use: `cld2` or `fasttext`. * `--fasttext-model` path to FastText model for fasttext classifier. @@ -61,6 +65,39 @@ warc2text -o [ -f ] [ --pdfpass ] Lines beginning with `#` and empty lines are ignored. Any invalid filter will raise a warning message, but will not prevent other filters from being read. +## Output +When used with `--output`/`-o` (with optionally `--files`/`-f`), warc2text will +produce the following directory structure at the path specified by `--output`: + +- `./{lang}/text.gz` will contain the plain text per document as base64 encoded lines. E.g. `gzip -cd en/text.gz | head -n5 | tail -n1 | base64 -d` will give you the 5th document's text. +- `./{lang}/url.gz` contains [the crawled URL](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warc-target-uri) for each record. 
+- `./{lang}/mime.gz` contains the mimetype as reported by the crawled server +- `./{lang}/html.gz` contains lines of base64 encoded HTML as returned by the server. For ePub, MS Office or ODF files this is the extracted XML. +- `./{lang}/file.gz` contains the `{filename}:{offset}:{length}` pointer to the warc archive the record was extracted from. `{offset}` and `{length}` are of the compressed data, e.g. `tail -c+{offset} < {filename} | head -c{length} | gzip -cd` will give you the original record. +- `./{lang}/date.gz` gives you the original crawl date/time as reported by the crawler. [This should be a UTC timestamp](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warc-date-mandatory). + +In every file, each line corresponds to the same record. E.g. the fifth line in `text.gz` and fifth line in `url.gz` together give you the text and url for a single record. + +The `{lang}` part of the path is determined by the classifier (see `--classifier`) and may be a two-letter or three-letter code depending on the classifier used. See [this list](https://github.com/CLD2Owners/cld2/blob/b56fa78a2fe44ac2851bae5bf4f4693a0644da7b/internal/generated_language.cc#L647-L1262) for CLD2. 
+ +When using `--jsonl`, the output is instead a single JSON record per line, with the following keys (always in this order): +```ts +{ + f: string, # filename of warc file (same as the `{filename}` part in `file.gz`) + o: number, # byte offset of record in warc file (same as `{offset}` in `file.gz`) + s: number, # warc file record size (same as `{length}` in `file.gz`) + rs: number, # byte size of record payload (uncompressed) + ps: number, # byte size of text only payload (so compare this against `rs` and you should get amount of HTML removed) + l: string, # identified language by classifier + u: string, # url + c: string, # content type as reported by the HTTP response header (or warc record header if that isn't present) + ts: string, # crawl date/time as reported by the crawler + p: string, # plain text +} +``` + +More keys might be added in the future (e.g. the raw HTML is not included now) and you should not expect the order of the keys to stay the same between different versions of warc2text. + ## Included dependencies HTML Tokenizer by [c-smile](https://www.codeproject.com/Articles/14076/Fast-and-Compact-HTML-XML-Scanner-Tokenizer) From deeeaf9338c5cfb210404135a25fcafae67d69c7 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 20 Dec 2023 23:34:16 +0100 Subject: [PATCH 14/14] Document `--fasttext-model` better --- README.md | 4 ++-- warc2text_main.cc | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 08cdce1..f030635 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,8 @@ warc2text -o [ -f ] [ --pdfpass ] * `--encode-urls` Escape non-ascii characters that appear in the record URL with `%dd` encoding. * `--multilang` Detect multiple languages in the document, and split the document accordingly. Only supported with CLD2 classifier. * `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML -* `--classifier` classifier to use: `cld2` or `fasttext`. 
-* `--fasttext-model` path to FastText model for fasttext classifier. +* `--classifier` classifier to use: `cld2` or `fasttext`. When `fasttext` is used, one also has to specify a model using `--fasttext-model`. +* `--fasttext-model` path to FastText model for fasttext classifier. Models can be any [FastText language identification model](https://fasttext.cc/docs/en/language-identification.html) such as [OpenLID lid201-model.ftz](https://github.com/laurieburchell/open-lid-dataset#quantised-model) * `--tag-filters` file containing filters that are used to eliminate matching documents * `--invert-tag-filters` output only documents that match the filter * `--url-filters` file containing regular expressions that match urls of documents to eliminate diff --git a/warc2text_main.cc b/warc2text_main.cc index 6a839b9..2c7ff80 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -119,6 +119,9 @@ int main(int argc, char *argv[]) { if (options.multilang) { BOOST_LOG_TRIVIAL(error) << "FastText classifier doesn't do multilang at the moment"; abort(); + } else if (options.fasttext_model.empty()) { + BOOST_LOG_TRIVIAL(error) << "No FastText language identification model specified. Use --fasttext-model"; + abort(); } else { detector.reset(new FastTextDetector(options.fasttext_model)); }