Skip to content

Commit

Permalink
starting to work on fetching full article content
Browse files Browse the repository at this point in the history
  • Loading branch information
martinrotter committed Apr 22, 2024
1 parent d8525c3 commit c7c069d
Show file tree
Hide file tree
Showing 9 changed files with 238 additions and 1 deletion.
1 change: 1 addition & 0 deletions resources/rssguard.qrc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

<file>scripts/adblock/adblock-server.js</file>
<file>scripts/readability/readabilize-article.js</file>
<file>scripts/article-extractor/extract-article.mjs</file>

<file>scripts/filters/blacklist.js</file>
<file>scripts/filters/whitelist.js</file>
Expand Down
11 changes: 11 additions & 0 deletions resources/scripts/article-extractor/extract-article.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Extracts the main article from the URL given as the first command-line
// argument and prints the result to stdout as a single JSON document.
//
// On failure the error is written to stderr and the process exits with a
// non-zero code, so that a caller which inspects the exit code can tell a
// failed extraction from a successful one.
import { extract } from '@extractus/article-extractor'

const input = process.argv[2];

try {
  const article = await extract(input);

  // The extractor can resolve to null when nothing usable was found; treat
  // that as a failure instead of printing the literal "null" as a result.
  if (article === null || article === undefined) {
    throw new Error(`No article could be extracted from '${input}'.`);
  }

  console.log(JSON.stringify(article));
}
catch (err) {
  console.error(err);

  // Use exitCode (not process.exit) so pending stderr output is flushed.
  process.exitCode = 1;
}
2 changes: 2 additions & 0 deletions src/librssguard/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ set(SOURCES
network-web/adblock/adblockrequestinfo.h
network-web/apiserver.cpp
network-web/apiserver.h
network-web/articleparse.cpp
network-web/articleparse.h
network-web/basenetworkaccessmanager.cpp
network-web/basenetworkaccessmanager.h
network-web/cookiejar.cpp
Expand Down
36 changes: 35 additions & 1 deletion src/librssguard/gui/webbrowser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
#include "miscellaneous/application.h"
#include "miscellaneous/iconfactory.h"
#include "miscellaneous/settings.h"
#include "network-web/articleparse.h"
#include "network-web/networkfactory.h"
#include "network-web/readability.h"
#include "network-web/webfactory.h"

#include <QJsonObject>
#include <QKeyEvent>
#include <QProgressBar>
#include <QScrollBar>
Expand All @@ -37,7 +39,10 @@ WebBrowser::WebBrowser(WebViewer* viewer, QWidget* parent)
#endif
m_actionReadabilePage(new QAction(qApp->icons()->fromTheme(QSL("text-html")),
tr("View website in reader mode"),
this)) {
this)),
m_actionGetFullArticle(new QAction(qApp->icons()->fromTheme(QSL("download"), QSL("browser-download")),
tr("Load full source article"),
this)) {
if (m_webView == nullptr) {
m_webView = qApp->createWebView();
dynamic_cast<QWidget*>(m_webView)->setParent(this);
Expand Down Expand Up @@ -87,6 +92,7 @@ void WebBrowser::createConnections() {

connect(m_actionOpenInSystemBrowser, &QAction::triggered, this, &WebBrowser::openCurrentSiteInSystemBrowser);
connect(m_actionReadabilePage, &QAction::triggered, this, &WebBrowser::readabilePage);
connect(m_actionGetFullArticle, &QAction::triggered, this, &WebBrowser::getFullArticle);

#if defined(ENABLE_MEDIAPLAYER)
connect(m_actionPlayPageInMediaPlayer, &QAction::triggered, this, &WebBrowser::playCurrentSiteInMediaPlayer);
Expand All @@ -99,6 +105,9 @@ void WebBrowser::createConnections() {

connect(qApp->web()->readability(), &Readability::htmlReadabled, this, &WebBrowser::setReadabledHtml);
connect(qApp->web()->readability(), &Readability::errorOnHtmlReadabiliting, this, &WebBrowser::readabilityFailed);

connect(qApp->web()->articleParse(), &ArticleParse::articleParsed, this, &WebBrowser::setFullArticleHtml);
connect(qApp->web()->articleParse(), &ArticleParse::errorOnArticlePArsing, this, &WebBrowser::fullArticleFailed);
}

void WebBrowser::updateUrl(const QUrl& url) {
Expand Down Expand Up @@ -182,6 +191,11 @@ void WebBrowser::readabilePage() {
qApp->web()->readability()->makeHtmlReadable(this, m_webView->html(), m_webView->url().toString());
}

void WebBrowser::getFullArticle() {
m_actionGetFullArticle->setEnabled(false);
qApp->web()->articleParse()->parseArticle(this, m_webView->url().toString());
}

bool WebBrowser::eventFilter(QObject* watched, QEvent* event) {
Q_UNUSED(watched)

Expand Down Expand Up @@ -292,6 +306,21 @@ void WebBrowser::readabilityFailed(QObject* sndr, const QString& error) {
}
}

// Displays the result of a full-article extraction requested by this browser.
//
// sndr         Requester the answer belongs to; answers for other requesters
//              are ignored.
// json_answer  Payload delivered with the answer. It is expected to be a JSON
//              object whose "content" member holds the article HTML, but it
//              is not guaranteed to be JSON at all.
void WebBrowser::setFullArticleHtml(QObject* sndr, const QString& json_answer) {
  if (sndr != this || json_answer.isEmpty()) {
    return;
  }

  QJsonParseError parse_error;
  const QJsonDocument json_doc = QJsonDocument::fromJson(json_answer.toUtf8(), &parse_error);

  if (parse_error.error != QJsonParseError::NoError) {
    // Payload is plain text (e.g. a status notice). Show it as it is rather
    // than silently replacing the page with an empty document.
    m_webView->setReadabledHtml(json_answer, m_webView->url());
    return;
  }

  const QString better_html = json_doc.object().value(QSL("content")).toString();

  if (better_html.isEmpty()) {
    // Nothing usable was extracted. Keep the current page and let the user
    // trigger the action again.
    m_actionGetFullArticle->setEnabled(true);
    return;
  }

  m_webView->setReadabledHtml(better_html, m_webView->url());
}

// Shows the error text of a failed full-article extraction in the web view.
// Failures addressed to other requesters and empty error texts are ignored.
void WebBrowser::fullArticleFailed(QObject* sndr, const QString& error) {
  const bool addressed_to_us = (sndr == this);

  if (!addressed_to_us || error.isEmpty()) {
    return;
  }

  m_webView->setReadabledHtml(error, m_webView->url());
}

void WebBrowser::initializeLayout() {
m_toolBar->setFloatable(false);
m_toolBar->setMovable(false);
Expand Down Expand Up @@ -324,9 +353,11 @@ void WebBrowser::initializeLayout() {

m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
m_actionGetFullArticle->setEnabled(false);

// Add needed actions into toolbar.
m_toolBar->addAction(m_actionOpenInSystemBrowser);
m_toolBar->addAction(m_actionGetFullArticle);
m_toolBar->addAction(m_actionReadabilePage);

#if defined(ENABLE_MEDIAPLAYER)
Expand Down Expand Up @@ -358,6 +389,7 @@ void WebBrowser::onLoadingStarted() {
m_loadingProgress->show();
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
m_actionGetFullArticle->setEnabled(false);

#if defined(ENABLE_MEDIAPLAYER)
m_actionPlayPageInMediaPlayer->setEnabled(false);
Expand All @@ -375,6 +407,7 @@ void WebBrowser::onLoadingFinished(bool success) {

if (url.isValid() && !url.host().isEmpty()) {
m_actionOpenInSystemBrowser->setEnabled(true);
m_actionGetFullArticle->setEnabled(true);
m_actionReadabilePage->setEnabled(true);

#if defined(ENABLE_MEDIAPLAYER)
Expand All @@ -384,6 +417,7 @@ void WebBrowser::onLoadingFinished(bool success) {
else {
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
m_actionGetFullArticle->setEnabled(false);

#if defined(ENABLE_MEDIAPLAYER)
m_actionPlayPageInMediaPlayer->setEnabled(false);
Expand Down
6 changes: 6 additions & 0 deletions src/librssguard/gui/webbrowser.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,14 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
void newWindowRequested(WebViewer* viewer);

void readabilePage();
void getFullArticle();

void setReadabledHtml(QObject* sndr, const QString& better_html);
void readabilityFailed(QObject* sndr, const QString& error);

void setFullArticleHtml(QObject* sndr, const QString& json_answer);
void fullArticleFailed(QObject* sndr, const QString& error);

signals:
void windowCloseRequested();
void iconChanged(int index, const QIcon& icon);
Expand Down Expand Up @@ -106,6 +111,7 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
#endif

QAction* m_actionReadabilePage;
QAction* m_actionGetFullArticle;

QList<Message> m_messages;
QPointer<RootItem> m_root;
Expand Down
140 changes: 140 additions & 0 deletions src/librssguard/network-web/articleparse.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// For license of this file, see <project-root-folder>/LICENSE.md.

#include "network-web/articleparse.h"

#include "3rd-party/boolinq/boolinq.h"
#include "exceptions/applicationexception.h"
#include "miscellaneous/application.h"

#include <QDir>

#define EXTRACTOR_PACKAGE "@extractus/article-extractor"
#define EXTRACTOR_VERSION "8.0.7"

ArticleParse::ArticleParse(QObject* parent) : QObject{parent}, m_modulesInstalling(false), m_modulesInstalled(false) {
connect(qApp->nodejs(), &NodeJs::packageInstalledUpdated, this, &ArticleParse::onPackageReady);
connect(qApp->nodejs(), &NodeJs::packageError, this, &ArticleParse::onPackageError);
}

void ArticleParse::onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date) {
Q_UNUSED(already_up_to_date)

bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
});

if (!concerns_extractor) {
return;
}

m_modulesInstalled = true;
m_modulesInstalling = false;

qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
{tr("Packages for article-extractor are installed"),
tr("Press the button once more!"),
QSystemTrayIcon::MessageIcon::Information},
{true, true, false});

// Emit this just to allow readability again for user.
emit articleParsed(nullptr, tr("Packages for article-extractor are installed. You can now use this feature!"));
}

void ArticleParse::onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error) {
bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
});

if (!concerns_extractor) {
return;
}

m_modulesInstalled = m_modulesInstalling = false;

qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
{tr("Packages for article-extractor are NOT installed"),
tr("There is error: %1").arg(error),
QSystemTrayIcon::MessageIcon::Critical},
{true, true, false});

// Emit this just to allow readability again for user.
emit articleParsed(nullptr, tr("Packages for article-extractor are NOT installed. There is error: %1").arg(error));
}

// Starts asynchronous extraction of the full article located at "url".
//
// sndr  Requester identity; it is passed back unchanged with the resulting
//       articleParsed()/errorOnArticlePArsing() signal so that receivers can
//       match the answer to their own request.
// url   Address of the article to extract.
//
// Until the extractor package is known to be installed, each call deploys the
// helper script and checks the package status. If the package is missing, its
// installation is started (once) and the call returns without parsing. If
// Node.js itself is not usable, the failure is reported through
// errorOnArticlePArsing() and no script is launched.
void ArticleParse::parseArticle(QObject* sndr, const QString& url) {
  if (!m_modulesInstalled) {
    // NOTE: Here we use MJS file directly placed in its NODE package folder
    // because NODE_PATH is not supported for MJS files.
    m_scriptFilename = qApp->nodejs()->processedPackageFolder() + QDir::separator() + QSL("extract-article.mjs");

    if (!IOFactory::copyFile(QSL(":/scripts/article-extractor/extract-article.mjs"), m_scriptFilename)) {
      // Deliberately not fatal; the attempt continues as before.
      qCriticalNN << LOGSEC_CORE << "Failed to copy article-extractor script to TEMP.";
    }

    try {
      const NodeJs::PackageStatus st_extractor =
        qApp->nodejs()->packageStatus({QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)});

      if (st_extractor != NodeJs::PackageStatus::UpToDate) {
        if (!m_modulesInstalling) {
          // We make sure to update modules.
          m_modulesInstalling = true;

          qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
                               {tr("Node.js libraries not installed"),
                                tr("%1 will now install some needed libraries, this will take only a few seconds. "
                                   "You will be notified when installation is complete.")
                                  .arg(QSL(APP_NAME)),
                                QSystemTrayIcon::MessageIcon::Warning},
                               {true, true, false});
          qApp->nodejs()->installUpdatePackages({{QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)}});
        }

        // Nothing can be parsed until the installation finishes.
        return;
      }

      m_modulesInstalled = true;
    }
    catch (const ApplicationException& ex) {
      const QString node_error = tr("Node.js is not configured properly. Go to \"Settings\" -> \"Node.js\" and check "
                                    "if your Node.js is properly configured.");

      qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
                           {tr("Node.js libraries not installed"),
                            node_error,
                            QSystemTrayIcon::MessageIcon::Critical},
                           {true, true, false});

      qCriticalNN << LOGSEC_CORE << "Failed to check for Node.js package status:" << QUOTE_W_SPACE_DOT(ex.message());

      // Report through the error signal: the payload of articleParsed() is
      // consumed as JSON, so a plain-text message would be lost there.
      emit errorOnArticlePArsing(sndr, node_error);

      // Node.js is unusable, so do not try to launch the script.
      return;
    }
  }

  QProcess* proc = new QProcess(this);

  // "this" is the context object, so sender() inside onParsingFinished()
  // resolves to the finished process.
  connect(proc,
          QOverload<int, QProcess::ExitStatus>::of(&QProcess::finished),
          this,
          [this, sndr](int exit_code, QProcess::ExitStatus exit_status) {
            onParsingFinished(sndr, exit_code, exit_status);
          });

  qApp->nodejs()->runScript(proc, m_scriptFilename, {url});
}

// Handles the end of the extractor process: forwards its standard output via
// articleParsed() when it exited normally with EXIT_SUCCESS, otherwise
// forwards its standard error via errorOnArticlePArsing(). The process object
// is scheduled for deletion in both cases.
void ArticleParse::onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status) {
  // The slot is reached from the QProcess::finished connection, so the
  // signal sender is the process itself.
  auto* finished_proc = qobject_cast<QProcess*>(sender());
  const bool succeeded = exit_status == QProcess::ExitStatus::NormalExit && exit_code == EXIT_SUCCESS;

  if (succeeded) {
    const auto raw_output = finished_proc->readAllStandardOutput();

    emit articleParsed(sndr, QString::fromUtf8(raw_output));
  }
  else {
    const auto raw_error = finished_proc->readAllStandardError();

    emit errorOnArticlePArsing(sndr, QString::fromUtf8(raw_error));
  }

  finished_proc->deleteLater();
}
34 changes: 34 additions & 0 deletions src/librssguard/network-web/articleparse.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// For license of this file, see <project-root-folder>/LICENSE.md.

#ifndef ARTICLEPARSE_H
#define ARTICLEPARSE_H

#include "miscellaneous/nodejs.h"

#include <QObject>
#include <QProcess>

// Extracts full article content for a given URL by running a Node.js helper
// script, and reports the outcome through signals. Results carry the
// requester pointer ("sndr") so that receivers can match answers to requests.
class ArticleParse : public QObject {
Q_OBJECT

public:
explicit ArticleParse(QObject* parent = nullptr);

// Starts extraction of the article at "url" on behalf of "sndr". The outcome
// is delivered later via articleParsed() or errorOnArticlePArsing().
void parseArticle(QObject* sndr, const QString& url);

private slots:
// Called when the helper process has finished; emits the result or error.
void onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status);

// Called when Node.js packages were installed/updated.
void onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date);

// Called when Node.js package installation failed.
void onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error);

signals:
// Emitted with the standard output of the helper script when it succeeds.
// NOTE(review): the parameter is named "better_html", yet the consumer in
// WebBrowser parses the payload as JSON - confirm the intended naming.
void articleParsed(QObject* sndr, const QString& better_html);

// Emitted with the error text when the helper script fails.
void errorOnArticlePArsing(QObject* sndr, const QString& error);

private:
// True while installation of the extractor package is in progress.
bool m_modulesInstalling;

// True once the extractor package is known to be installed.
bool m_modulesInstalled;

// Path of the deployed helper script that gets executed.
QString m_scriptFilename;
};

#endif // ARTICLEPARSE_H
6 changes: 6 additions & 0 deletions src/librssguard/network-web/webfactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "miscellaneous/settings.h"
#include "network-web/adblock/adblockmanager.h"
#include "network-web/apiserver.h"
#include "network-web/articleparse.h"
#include "network-web/cookiejar.h"
#include "network-web/readability.h"

Expand Down Expand Up @@ -54,6 +55,7 @@ WebFactory::WebFactory(QObject* parent) : QObject(parent), m_apiServer(nullptr),

m_cookieJar = new CookieJar(this);
m_readability = new Readability(this);
m_articleParse = new ArticleParse(this);

#if defined(NO_LITE)
#if QT_VERSION >= 0x050D00 // Qt >= 5.13.0
Expand Down Expand Up @@ -568,6 +570,10 @@ Readability* WebFactory::readability() const {
return m_readability;
}

// Returns the ArticleParse instance created in the WebFactory constructor.
ArticleParse* WebFactory::articleParse() const {
return m_articleParse;
}

void WebFactory::startApiServer() {
m_apiServer = new ApiServer(this);
m_apiServer->setListenAddressPort(QSL("http://localhost:54123"), true);
Expand Down
3 changes: 3 additions & 0 deletions src/librssguard/network-web/webfactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class AdBlockManager;
class CookieJar;
class ApiServer;
class Readability;
class ArticleParse;

class RSSGUARD_DLLSPEC WebFactory : public QObject {
Q_OBJECT
Expand Down Expand Up @@ -50,6 +51,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {

CookieJar* cookieJar() const;
Readability* readability() const;
ArticleParse* articleParse() const;

void startApiServer();
void stopApiServer();
Expand Down Expand Up @@ -95,6 +97,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
ApiServer* m_apiServer;
CookieJar* m_cookieJar;
Readability* m_readability;
ArticleParse* m_articleParse;
QString m_customUserAgent;
};

Expand Down

0 comments on commit c7c069d

Please sign in to comment.