2022-06-02 17:42:13 -03:00
|
|
|
/*****************************************************************************\
|
|
|
|
| ( )_ _ |
|
|
|
|
| _ _ _ __ _ _ __ ___ ___ _ _ | ,_)(_) ___ ___ _ |
|
|
|
|
| ( '_`\ ( '__)/'_` ) /'_ `\/' _ ` _ `\ /'_` )| | | |/',__)/' v `\ /'_`\ |
|
|
|
|
| | (_) )| | ( (_| |( (_) || ( ) ( ) |( (_| || |_ | |\__, \| (˅) |( (_) ) |
|
|
|
|
| | ,__/'(_) `\__,_)`\__ |(_) (_) (_)`\__,_)`\__)(_)(____/(_) (_)`\___/' |
|
|
|
|
| | | ( )_) | |
|
|
|
|
| (_) \___/' |
|
|
|
|
| |
|
|
|
|
| General Bots Copyright (c) Pragmatismo.io. All rights reserved. |
|
|
|
|
| Licensed under the AGPL-3.0. |
|
|
|
|
| |
|
|
|
|
| According to our dual licensing model, this program can be used either |
|
|
|
|
| under the terms of the GNU Affero General Public License, version 3, |
|
|
|
|
| or under a proprietary license. |
|
|
|
|
| |
|
|
|
|
| The texts of the GNU Affero General Public License with an additional |
|
|
|
|
| permission and of our proprietary license can be found at and |
|
|
|
|
| in the LICENSE file you have received along with this program. |
|
|
|
|
| |
|
|
|
|
| This program is distributed in the hope that it will be useful, |
|
|
|
|
| but WITHOUT ANY WARRANTY, without even the implied warranty of |
|
|
|
|
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
|
|
| GNU Affero General Public License for more details. |
|
|
|
|
| |
|
|
|
|
| "General Bots" is a registered trademark of Pragmatismo.io. |
|
|
|
|
| The licensing of the program under the AGPLv3 does not imply a |
|
|
|
|
| trademark license. Therefore any rights, title and interest in |
|
|
|
|
| our trademarks remain entirely with us. |
|
|
|
|
| |
|
|
|
|
\*****************************************************************************/
|
|
|
|
|
|
|
|
/**
 * @fileoverview General Bots SSR (server-side rendering) support, based on
 * https://www.npmjs.com/package/ssr-for-bots.
 */
|
|
|
|
|
|
|
|
'use strict';
|
|
|
|
|
2023-02-18 16:48:40 -03:00
|
|
|
import Path from 'path';
|
2022-11-18 22:39:14 -03:00
|
|
|
import Fs from 'fs';
|
2022-11-19 23:34:58 -03:00
|
|
|
import { NextFunction, Request, Response } from 'express';
|
|
|
|
import urljoin from 'url-join';
|
2023-02-18 16:48:40 -03:00
|
|
|
import { GBMinInstance } from 'botlib';
|
|
|
|
import { GBServer } from '../../../src/app.js';
|
|
|
|
import { GBLogEx } from './GBLogEx.js';
|
|
|
|
import { createRequire } from 'module';
|
2023-02-20 10:29:04 -03:00
|
|
|
import urlJoin from 'url-join';
|
|
|
|
import { GBDeployer } from './GBDeployer.js';
|
|
|
|
import { GBMinService } from './GBMinService.js';
|
2023-02-18 16:48:40 -03:00
|
|
|
const require = createRequire(import.meta.url);
|
|
|
|
const puppeteer = require('puppeteer-extra');
|
|
|
|
const hidden = require('puppeteer-extra-plugin-stealth');
|
|
|
|
const { executablePath } = require('puppeteer');
|
|
|
|
|
|
|
|
/**
 * Server-side rendering helpers: launches a browser to snapshot a bot's web UI
 * as static HTML, and serves either that snapshot (to crawlers) or the regular
 * UI shell / static build files (to everyone else).
 */
export class GBSSR {
  // https://hackernoon.com/tips-and-tricks-for-web-scraping-with-puppeteer-ed391a63d952
  // Don't download all resources, we just need the HTML.
  // Also, this is a huge performance/response time boost.
  private static blockedResourceTypes = [
    'image',
    'media',
    'font',
    'texttrack',
    'object',
    'beacon',
    'csp_report',
    'imageset'
  ];

  // const whitelist = ["document", "script", "xhr", "fetch"];

  // Requests whose URL contains any of these fragments are aborted while rendering.
  private static skippedResources = [
    'quantserve',
    'adzerk',
    'doubleclick',
    'adition',
    'exelator',
    'sharethrough',
    'cdn.api.twitter',
    'google-analytics',
    'googletagmanager',
    'google',
    'fontawesome',
    'facebook',
    'analytics',
    'optimizely',
    'clicktale',
    'mixpanel',
    'zedo',
    'clicksor',
    'tiqcdn'
  ];

  // User-agent fragments (matched case-insensitively) that trigger the prerendered page.
  private static crawlerUserAgents = [
    'bot',
    'googlebot',
    'Chrome-Lighthouse',
    'DuckDuckBot',
    'ia_archiver',
    'bingbot',
    'yandex',
    'baiduspider',
    'Facebot',
    'facebookexternalhit',
    'facebookexternalhit/1.1',
    'twitterbot',
    'rogerbot',
    'linkedinbot',
    'embedly',
    'quora link preview',
    'showyoubot',
    'outbrain',
    'pinterest',
    'slackbot',
    'vkShare',
    'W3C_Validator'
  ];

  // Paths and/or extensions that are never answered with the prerendered page.
  private static excludedFromPrerender = ['.xml', '.ico', '.txt', '.json'];

  // Time given to the page to settle after navigation before it is snapshotted.
  private static settleDelayMs = 15000;

  // The stealth plugin only needs to be registered once per process.
  private static stealthRegistered = false;

  /**
   * Launches a browser through puppeteer-extra with the stealth plugin.
   *
   * @param profilePath Optional user-data directory. When given, the profile's
   *   `Default/Preferences` file (if present) has `profile.exit_type` reset to
   *   `'none'` before launch.
   * @returns The launched browser instance.
   * @throws If the Preferences file exists but is not valid JSON, or if the
   *   browser cannot be launched.
   */
  public static async createBrowser(profilePath?: string | null): Promise<any> {
    const args = [
      '--check-for-update-interval=2592000',
      '--disable-accelerated-2d-canvas',
      '--disable-dev-shm-usage',
      '--disable-features=site-per-process',
      '--disable-gpu',
      '--no-first-run',
      '--no-default-browser-check'
    ];

    if (profilePath) {
      args.push(`--user-data-dir=${profilePath}`);

      // This is a filesystem path, so it is built with Path.join rather than a URL joiner.
      const preferences = Path.join(profilePath, 'Default', 'Preferences');
      if (Fs.existsSync(preferences)) {
        const data = JSON.parse(Fs.readFileSync(preferences, 'utf8'));

        // A Preferences file without a "profile" section is left untouched.
        if (data && typeof data.profile === 'object' && data.profile !== null) {
          data.profile.exit_type = 'none';
          Fs.writeFileSync(preferences, JSON.stringify(data));
        }
      }
    }

    if (!GBSSR.stealthRegistered) {
      puppeteer.use(hidden());
      GBSSR.stealthRegistered = true;
    }

    return await puppeteer.launch({
      args: args,
      ignoreHTTPSErrors: true,
      headless: false,
      defaultViewport: null,
      executablePath: executablePath(),
      ignoreDefaultArgs: ['--enable-automation', '--enable-blink-features=IdleDetection']
    });
  }

  /**
   * Return the HTML of bot default.gbui.
   *
   * Opens the bot's public URL in a fresh browser, waits for it to settle,
   * injects a <base> tag, strips scripts, inlines same-origin stylesheets and
   * returns the resulting markup.
   *
   * @param min Bot instance whose `botId` is appended to the public address.
   * @returns The rendered HTML, or `undefined` when rendering failed (the
   *   failure is logged, not thrown).
   */
  public static async getHTML(min: GBMinInstance) {
    const url = urljoin(GBServer.globals.publicAddress, min.botId);
    const browser = await GBSSR.createBrowser(null);
    const stylesheetContents = {};
    let html;

    try {
      const page = await browser.newPage();
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
      );
      await page.setRequestInterception(true);

      page.on('request', request => {
        const requestUrl = request.url().split('?')[0].split('#')[0];
        const blocked =
          GBSSR.blockedResourceTypes.includes(request.resourceType()) ||
          GBSSR.skippedResources.some(resource => requestUrl.includes(resource));

        if (blocked) {
          request.abort();
        } else {
          request.continue();
        }
      });

      page.on('response', async resp => {
        // This listener is not awaited by anyone, so a failure here must be
        // handled locally instead of surfacing as an unhandled rejection.
        try {
          const responseUrl = resp.url();
          const sameOrigin = new URL(responseUrl).origin === new URL(url).origin;
          const isStylesheet = resp.request().resourceType() === 'stylesheet';
          if (sameOrigin && isStylesheet) {
            stylesheetContents[responseUrl] = await resp.text();
          }
        } catch (error) {
          GBLogEx.info(min, `SSR: stylesheet not captured: ${String(error)}`);
        }
      });

      await page.setExtraHTTPHeaders({
        'ngrok-skip-browser-warning': '1'
      });

      await page.goto(url, {
        timeout: 120000,
        waitUntil: 'networkidle0'
      });

      await new Promise(resolve => {
        setTimeout(resolve, GBSSR.settleDelayMs);
      });

      // Inject <base> on page so relative resources load properly.
      await page.evaluate(href => {
        const base = document.createElement('base');
        base.href = href;
        // Add to top of head, before all other resources.
        document.head.prepend(base);
      }, url);

      // Remove scripts and html imports. They've already executed.
      await page.evaluate(() => {
        document.querySelectorAll('script, link[rel="import"]').forEach(e => {
          e.remove();
        });
      });

      // Replace stylesheets in the page with their equivalent <style>.
      await page.$$eval(
        'link[rel="stylesheet"]',
        (links, content) => {
          links.forEach((link: any) => {
            const cssText = content[link.href];
            if (cssText) {
              const style = document.createElement('style');
              style.textContent = cssText;
              link.replaceWith(style);
            }
          });
        },
        stylesheetContents
      );

      html = await page.content();

      // Close the page we opened here (not the browser).
      await page.close();
    } catch (error) {
      // Kept in its own variable so it never shadows the result being returned.
      const message = String(error);
      GBLogEx.error(min, `URL: ${url} Failed with message: ${message}`);
    } finally {
      await browser.close();
    }

    return html;
  }

  /**
   * Express handler that answers a request with one of:
   *  - the prerendered `index.html` of the bot, when the user agent is a known
   *    crawler, the URL is not excluded and the prerendered file exists;
   *  - the UI shell `index.html` with `{botId}`, `{theme}` and `{title}` filled
   *    in, when the request resolves to a bot;
   *  - a static file from the UI build folder otherwise;
   *  - HTTP 404 when nothing servable is found.
   *
   * @param req Incoming request; `url`, `originalUrl` and the `user-agent` header are read.
   * @param res Response used to send the page, file or 404.
   * @param next Unused; present so the method has the middleware signature.
   * @returns `true` when content was served, `false` when a 404 was sent.
   */
  public static async ssrFilter(req: Request, res: Response, next) {
    const userAgent: string = req.headers['user-agent'] || '';
    const originalUrl: string = req.originalUrl || '';

    // Plain substring matching: the tokens are literals, not regular
    // expressions (so '.ico' must not match 'pico').
    const isCrawler = GBSSR.containsAny(userAgent, GBSSR.crawlerUserAgents, true);
    const isExcluded = GBSSR.containsAny(originalUrl, GBSSR.excludedFromPrerender);

    // Query string and fragment identify neither the bot nor the file on disk.
    const requestPath = GBSSR.stripQuery(req.url || '');
    const botId = originalUrl
      ? GBSSR.stripQuery(originalUrl).substring(1)
      : GBServer.globals.minInstances[0]?.botId; // TODO: Get only bot.

    const min: GBMinInstance =
      requestPath === '/'
        ? GBServer.globals.minInstances[0]
        : GBServer.globals.minInstances.find(p => p.instance.botId === botId);

    // PWD is not set on every platform; fall back to the working directory.
    const root = process.env.PWD ?? process.cwd();

    // Reads from static HTML when a bot is crawling.
    if (min && originalUrl && isCrawler && !isExcluded) {
      const prerendered = Path.join(
        root,
        'work',
        `${min.instance.botId}.gbai`,
        `${min.instance.botId}.gbui`,
        'index.html'
      );

      // When nothing was prerendered yet, fall through to the regular UI shell.
      if (Fs.existsSync(prerendered)) {
        res.status(200).send(Fs.readFileSync(prerendered, 'utf8'));
        return true;
      }
    }

    const buildRoot = Path.join(root, GBDeployer.deployFolder, GBMinService.uiPackage, 'build');
    const path = Path.join(buildRoot, min ? 'index.html' : requestPath);

    // The request path is untrusted: after normalization it must still be
    // inside the build folder, otherwise '..' segments could expose any file.
    const insideBuild = path.startsWith(buildRoot + Path.sep);
    const servable = insideBuild && Fs.existsSync(path) && Fs.statSync(path).isFile();

    if (!servable) {
      GBLogEx.info(min, `HTTP 404: ${req.url}.`);
      res.status(404);
      res.end();
      return false;
    }

    if (min) {
      const html = GBSSR.fillTemplate(Fs.readFileSync(path, 'utf8'), {
        botId: min.botId,
        theme: min.instance.theme,
        title: min.instance.title
      });
      res.send(html).end();
    } else {
      res.sendFile(path);
    }
    return true;
  }

  // Returns the URL without its query string and fragment.
  private static stripQuery(url: string): string {
    return url.split('?')[0].split('#')[0];
  }

  // Tells whether `text` contains at least one of `needles` as a literal substring.
  private static containsAny(text: string, needles: readonly string[], ignoreCase = false): boolean {
    const haystack = ignoreCase ? text.toLowerCase() : text;
    return needles.some(needle => haystack.includes(ignoreCase ? needle.toLowerCase() : needle));
  }

  // Replaces every `{name}` placeholder (case-insensitive) with its literal value.
  private static fillTemplate(html: string, values: { [name: string]: unknown }): string {
    let result = html;
    for (const name of Object.keys(values)) {
      const value = String(values[name] ?? '');
      // A replacer function keeps '$' sequences in the value from being interpreted.
      result = result.replace(new RegExp(`\\{${name}\\}`, 'gi'), () => value);
    }
    return result;
  }
}
|