From 093ee2ef7426efd906cc07094ecdd5443704c9d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?= Date: Sun, 8 Nov 2020 01:12:59 +0100 Subject: [PATCH] remove image scraping for facebook pages --- package.json | 1 - scrape.sh | 7 +-- src/logic.js | 127 +------------------------------------------------- src/scrape.js | 27 +---------- yarn.lock | 53 --------------------- 5 files changed, 3 insertions(+), 212 deletions(-) diff --git a/package.json b/package.json index 7f3a1d2..a72b55b 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,6 @@ { "license": "UNLICENSED", "dependencies": { - "gm": "^1.23.1", "minimist": "^1.2.5", "puppeteer": "^3.0.2", "ramda": "^0.27.0" diff --git a/scrape.sh b/scrape.sh index 2142a69..4329738 100755 --- a/scrape.sh +++ b/scrape.sh @@ -9,18 +9,13 @@ function usage { OPTIONS: -h --help -? print usage --events File in JSON format that contains an array - of prevously parsed events. This option - will disable image scraping of previously - scraped events. + of previously parsed events. -p --page Facebook page id. Scrape all events of a specific facebook page. --pages List of Facebook page ids. See examples for format. -o --output Output events into this path instead of stdout. - -i --images (experimental) Scrape event images. - --image-directory Default: './img'. Set directory for saving - event images. --skip-upcoming-events Default: The scraper will automatically scrape upcoming events, with this option enabled they will be skipped. 
diff --git a/src/logic.js b/src/logic.js index 79c5ef4..f744810 100644 --- a/src/logic.js +++ b/src/logic.js @@ -1,16 +1,7 @@ -const { - eqBy, - hasPath, - maxBy, - pathOr, - prop, - props, - unionWith, -} = require('ramda'); +const { hasPath, pathOr, props } = require('ramda'); const parseArgs = require('minimist'); const process = require('process'); -const event_url = (event_id) => `https://www.facebook.com/events/${event_id}`; const page_url = (page_id) => `https://www.facebook.com/${page_id}`; const page_events_url = (page_id) => page_url(page_id) + '/events/'; const { graphql_endpoint } = require('./constants'); @@ -18,10 +9,6 @@ const { graphql_endpoint } = require('./constants'); const fs = require('fs').promises; const filesystem = require('fs'); -const path = require('path'); - -const gm = require('gm').subClass({ imageMagick: true }); - const puppeteer = require('puppeteer'); const flatten_string = (page_id) => { @@ -70,10 +57,6 @@ const parse_args = (args) => { } const output = parse_output(argv); - const images = pathOr(false, ['images'], argv) || pathOr(false, ['i'], argv); - const image_directory = flatten_string( - pathOr('./img', ['image-directory'], argv), - ); const get_upcoming_events = !pathOr(false, ['skip-upcoming-events'], argv); const get_past_events = pathOr(false, ['past-events'], argv); @@ -87,8 +70,6 @@ const parse_args = (args) => { ], events, output, - images, - image_directory, get_upcoming_events, get_past_events, headless, @@ -112,85 +93,12 @@ const merge_edges = (acc, current) => { ]; }; -const write_image = (path, image) => - fs.writeFile(path, image, { encoding: null }); - -const gm_write = (image, path) => { - return new Promise((resolve, reject) => - image.write(path, (err) => (!err ? resolve() : reject(err))), - ); -}; - -const write_resized = async (image_path, original) => { - const image = gm(original); - const size = await new Promise((resolve) => { - image.size((err, value) => (!err ? 
resolve(value) : resolve(null))); - }); - - if (size === null) { - throw new Error('Could not get image.'); - } - - let { height: y, width: x } = size; - - if (y % 2 === 1) { - y = y + 1; - } - - if (x % 2 === 1) { - x = x + 1; - } - - image.resize(x, y); - - if (y > x) { - const z = (y - x) / 2; - image.crop(x, x, 0, z); - } - - if (y < x) { - const z = (x - y) / 2; - image.crop(y, y, z, 0); - } - - return gm_write(image, image_path); -}; - -const save_images = async (image = null, event_id, image_directory) => { - if (image === null) { - return []; - } - const original_path = `${image_directory}/${event_id}.jpg`; - const resized_path = `${image_directory}/${event_id}-square.jpg`; - const original = write_image(original_path, image); - const resized_square = write_resized(resized_path, image); - try { - await Promise.all([original, resized_square]); - return { original: original_path, square: resized_path }; - } catch (err) { - console.error(err); - return { original: null }; - } -}; - const get_city_name = (event) => pathOr('', 'event_place.city.contextual_name'.split('.'), event); const get_event_host = (event) => pathOr('', 'event_place.contextual_name'.split('.'), event); -const create_images_directory = (images_directory) => { - if (images_directory === null || images_directory === undefined) { - return Promise.reject('Image path was not set'); - } - - if (!filesystem.existsSync(images_directory)) { - return fs.mkdir(images_directory, { recursive: true }).catch(console.error); - } - - return Promise.resolve(); -}; - const read_previous_events = (path) => { if (path !== null) { if (filesystem.existsSync(path)) { @@ -206,36 +114,6 @@ const read_previous_events = (path) => { return Promise.resolve([]); }; -const load_event = async (page, event_id) => { - try { - const image_data = new Promise((resolve) => { - const images = []; - page.on('response', async (response) => { - const response_url = response.request().url(); - const { pathname } = new 
URL(response_url); - const ext = path.extname(pathname); - if (ext === '.jpg') { - const image = await response.buffer(); - images.push(image); - } - }); - page.on('domcontentloaded', async () => { - resolve(images); - }); - }); - - await page.goto(event_url(event_id)); - const images = await image_data; - const image = images.reduce((res, image) => - maxBy((item) => item.length, res, image), - ); - return { image }; - } catch (e) { - console.error(e); - return {}; - } -}; - const map_event = ({ node: event }) => { const ticket_url = pathOr('', ['event_buy_ticket_url'], event); const city = get_city_name(event); @@ -391,12 +269,9 @@ const get_page_events = async ( }; module.exports = { - create_images_directory, get_page_events, - load_event, merge_edges, open_browser, parse_args, read_previous_events, - save_images, }; diff --git a/src/scrape.js b/src/scrape.js index 29a5473..aa1f9d2 100644 --- a/src/scrape.js +++ b/src/scrape.js @@ -1,32 +1,23 @@ const { pathOr, uniqBy, eqBy, prop, union } = require('ramda'); const { - create_images_directory, get_page_events, + merge_edges, open_browser, parse_args, read_previous_events, - merge_edges, - load_event, - save_images, } = require('./logic'); const { events: event_file, get_past_events, get_upcoming_events, - image_directory, - images, output, page_ids, headless, } = parse_args(process.argv.slice(2)); (async () => { - if (images) { - create_images_directory(image_directory); - } - let events = []; const browser = await open_browser({ headless }); @@ -43,22 +34,6 @@ const { } catch (e) { console.error(e); } - if (images) { - page_events = await Promise.all( - page_events.map(async (event) => { - const event_page = await browser.newPage(); - const { image } = await load_event(event_page, event.event_id); - await event_page.close(); - const images = await save_images( - image, - event.event_id, - image_directory, - ); - return { images, ...event }; - }), - ); - } - events = uniqBy(eqBy(prop('event_id')))(union(events, 
page_events)); } diff --git a/yarn.lock b/yarn.lock index 1c77408..5161a0b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1254,16 +1254,6 @@ arr-union@^3.1.0: resolved "https://registry.yarnpkg.com/arr-union/-/arr-union-3.1.0.tgz#e39b09aea9def866a8f206e288af63919bae39c4" integrity sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ= -array-parallel@~0.1.3: - version "0.1.3" - resolved "https://registry.yarnpkg.com/array-parallel/-/array-parallel-0.1.3.tgz#8f785308926ed5aa478c47e64d1b334b6c0c947d" - integrity sha1-j3hTCJJu1apHjEfmTRszS2wMlH0= - -array-series@~0.1.5: - version "0.1.5" - resolved "https://registry.yarnpkg.com/array-series/-/array-series-0.1.5.tgz#df5d37bfc5c2ef0755e2aa4f92feae7d4b5a972f" - integrity sha1-3103v8XC7wdV4qpPkv6ufUtaly8= - array-unique@^0.3.2: version "0.3.2" resolved "https://registry.yarnpkg.com/array-unique/-/array-unique-0.3.2.tgz#a894b75d4bc4f6cd679ef3244a9fd8f46ae2d428" @@ -1668,14 +1658,6 @@ core-util-is@1.0.2: resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7" integrity sha1-tf1UIgqivFq1eqtxQMlAdUUDwac= -cross-spawn@^4.0.0: - version "4.0.2" - resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-4.0.2.tgz#7b9247621c23adfdd3856004a823cbe397424d41" - integrity sha1-e5JHYhwjrf3ThWAEqCPL45dCTUE= - dependencies: - lru-cache "^4.0.1" - which "^1.2.9" - cross-spawn@^6.0.0: version "6.0.5" resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4" @@ -1743,13 +1725,6 @@ debug@^2.2.0, debug@^2.3.3: dependencies: ms "2.0.0" -debug@^3.1.0: - version "3.2.6" - resolved "https://registry.yarnpkg.com/debug/-/debug-3.2.6.tgz#e83d17de16d8a7efb7717edbe5fb10135eee629b" - integrity sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ== - dependencies: - ms "^2.1.1" - decamelize@^1.2.0: version "1.2.0" resolved 
"https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290" @@ -2339,16 +2314,6 @@ globals@^12.1.0: dependencies: type-fest "^0.8.1" -gm@^1.23.1: - version "1.23.1" - resolved "https://registry.yarnpkg.com/gm/-/gm-1.23.1.tgz#2edeeb958084d0f8ea7988e5d995b1c7dfc14777" - integrity sha1-Lt7rlYCE0PjqeYjl2ZWxx9/BR3c= - dependencies: - array-parallel "~0.1.3" - array-series "~0.1.5" - cross-spawn "^4.0.0" - debug "^3.1.0" - graceful-fs@^4.2.4: version "4.2.4" resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.4.tgz#2256bde14d3632958c465ebc96dc467ca07a29fb" @@ -3307,14 +3272,6 @@ loose-envify@^1.0.0: dependencies: js-tokens "^3.0.0 || ^4.0.0" -lru-cache@^4.0.1: - version "4.1.5" - resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-4.1.5.tgz#8bbe50ea85bed59bc9e33dcab8235ee9bcf443cd" - integrity sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g== - dependencies: - pseudomap "^1.0.2" - yallist "^2.1.2" - make-dir@^3.0.0: version "3.1.0" resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-3.1.0.tgz#415e967046b3a7f1d185277d84aa58203726a13f" @@ -3802,11 +3759,6 @@ proxy-from-env@^1.0.0: resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2" integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg== -pseudomap@^1.0.2: - version "1.0.2" - resolved "https://registry.yarnpkg.com/pseudomap/-/pseudomap-1.0.2.tgz#f052a28da70e618917ef0a8ac34c1ae5a68286b3" - integrity sha1-8FKijacOYYkX7wqKw0wa5aaChrM= - psl@^1.1.28: version "1.8.0" resolved "https://registry.yarnpkg.com/psl/-/psl-1.8.0.tgz#9326f8bcfb013adcc005fdff056acce020e51c24" @@ -4862,11 +4814,6 @@ y18n@^4.0.0: resolved "https://registry.yarnpkg.com/y18n/-/y18n-4.0.0.tgz#95ef94f85ecc81d007c264e190a120f0a3c8566b" integrity 
sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w== -yallist@^2.1.2: - version "2.1.2" - resolved "https://registry.yarnpkg.com/yallist/-/yallist-2.1.2.tgz#1c11f9218f076089a47dd512f93c6699a6a81d52" - integrity sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI= - yargs-parser@^18.1.1: version "18.1.3" resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.3.tgz#be68c4975c6b2abf469236b0c870362fab09a7b0"