Browse Source

remove image scraping for facebook pages

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
093ee2ef74
  1. 1
      package.json
  2. 7
      scrape.sh
  3. 127
      src/logic.js
  4. 27
      src/scrape.js
  5. 53
      yarn.lock

1
package.json

@ -1,7 +1,6 @@
{
"license": "UNLICENSED",
"dependencies": {
"gm": "^1.23.1",
"minimist": "^1.2.5",
"puppeteer": "^3.0.2",
"ramda": "^0.27.0"

7
scrape.sh

@ -9,18 +9,13 @@ function usage {
OPTIONS:
-h --help -? print usage
--events File in JSON format that contains an array
of previously parsed events. This option
will disable image scraping of previously
scraped events.
of previously parsed events.
-p --page Facebook page id. Scrape all events of a
specific facebook page.
--pages List of Facebook page ids. See examples for
format.
-o --output Output events into this path instead of
stdout.
-i --images (experimental) Scrape event images.
--image-directory Default: './img'. Set directory for saving
event images.
--skip-upcoming-events Default: The scraper will automatically
scrape upcoming events, with this option
enabled they will be skipped.

127
src/logic.js

@ -1,16 +1,7 @@
const {
eqBy,
hasPath,
maxBy,
pathOr,
prop,
props,
unionWith,
} = require('ramda');
const { hasPath, pathOr, props } = require('ramda');
const parseArgs = require('minimist');
const process = require('process');
/**
 * Build the URL of a single Facebook event.
 * @param event_id - Identifier interpolated into the URL path.
 * @returns {string} The event URL.
 */
const event_url = (event_id) => {
  return `https://www.facebook.com/events/${event_id}`;
};

/**
 * Build the URL of a Facebook page.
 * @param page_id - Identifier interpolated into the URL path.
 * @returns {string} The page URL.
 */
const page_url = (page_id) => {
  return `https://www.facebook.com/${page_id}`;
};

/**
 * Build the URL of the events listing of a Facebook page.
 * @param page_id - Identifier forwarded to `page_url`.
 * @returns {string} The page URL followed by `/events/`.
 */
const page_events_url = (page_id) => {
  const base = page_url(page_id);
  return `${base}/events/`;
};
const { graphql_endpoint } = require('./constants');
@ -18,10 +9,6 @@ const { graphql_endpoint } = require('./constants');
const fs = require('fs').promises;
const filesystem = require('fs');
const path = require('path');
const gm = require('gm').subClass({ imageMagick: true });
const puppeteer = require('puppeteer');
const flatten_string = (page_id) => {
@ -70,10 +57,6 @@ const parse_args = (args) => {
}
const output = parse_output(argv);
const images = pathOr(false, ['images'], argv) || pathOr(false, ['i'], argv);
const image_directory = flatten_string(
pathOr('./img', ['image-directory'], argv),
);
const get_upcoming_events = !pathOr(false, ['skip-upcoming-events'], argv);
const get_past_events = pathOr(false, ['past-events'], argv);
@ -87,8 +70,6 @@ const parse_args = (args) => {
],
events,
output,
images,
image_directory,
get_upcoming_events,
get_past_events,
headless,
@ -112,85 +93,12 @@ const merge_edges = (acc, current) => {
];
};
/**
 * Write image data to disk through the promise-based `fs` API.
 * @param path - Destination file path.
 * @param image - Data to write (presumably a Buffer with the raw image
 *   bytes; confirm against the caller).
 * @returns {Promise<void>} Settles when the write has finished.
 */
const write_image = (path, image) => {
  // No text encoding is requested for the write.
  const write_options = { encoding: null };
  return fs.writeFile(path, image, write_options);
};
/**
 * Promise adapter around the callback-style `image.write(path, callback)`.
 * @param image - Object exposing `write(path, callback)`.
 * @param path - Destination handed to `image.write`.
 * @returns {Promise<void>} Resolves with no value when the callback reports
 *   no error; rejects with the reported error otherwise.
 */
const gm_write = (image, path) =>
  new Promise((resolve, reject) => {
    image.write(path, (err) => {
      if (err) {
        reject(err);
        return;
      }
      resolve();
    });
  });
/**
 * Write a centre-cropped, square variant of an image to `image_path`.
 *
 * Odd dimensions are first bumped up by one so both sides are even, the
 * image is resized to those dimensions, and the longer side is then cropped
 * symmetrically down to the length of the shorter one.
 *
 * @param image_path - Destination passed on to `gm_write`.
 * @param original - Image source handed to `gm(...)`.
 * @returns {Promise<void>} Settles when `gm_write` settles.
 * @throws {Error} 'Could not get image.' when the size lookup reports an error.
 */
const write_resized = async (image_path, original) => {
  const image = gm(original);

  // Any size-lookup error is collapsed into `null` and reported below.
  const size = await new Promise((resolve) => {
    image.size((err, value) => {
      resolve(err ? null : value);
    });
  });
  if (size === null) {
    throw new Error('Could not get image.');
  }

  const round_up_to_even = (n) => (n % 2 === 1 ? n + 1 : n);
  const y = round_up_to_even(size.height);
  const x = round_up_to_even(size.width);
  image.resize(x, y);

  // NOTE(review): crop arguments are presumably (width, height, x-offset,
  // y-offset) per the gm API — confirm.
  if (y > x) {
    // Taller than wide: trim equal bands from top and bottom.
    image.crop(x, x, 0, (y - x) / 2);
  } else if (y < x) {
    // Wider than tall: trim equal bands from left and right.
    image.crop(y, y, (x - y) / 2, 0);
  }

  return gm_write(image, image_path);
};
/**
 * Store an event image and its square variant in `image_directory`.
 *
 * @param image - Image data; when `null` or omitted nothing is written.
 * @param event_id - Used as the base file name.
 * @param image_directory - Directory prefix for both output files.
 * @returns {Promise<Array|Object>} `[]` when there is no image,
 *   `{ original, square }` with both paths when both writes succeed, or
 *   `{ original: null }` when either write fails (the error is logged).
 */
const save_images = async (image = null, event_id, image_directory) => {
  if (image === null) {
    return [];
  }

  const original_path = `${image_directory}/${event_id}.jpg`;
  const resized_path = `${image_directory}/${event_id}-square.jpg`;

  // Start both writes before awaiting so they run concurrently.
  const pending_writes = [
    write_image(original_path, image),
    write_resized(resized_path, image),
  ];

  let saved;
  try {
    await Promise.all(pending_writes);
    saved = { original: original_path, square: resized_path };
  } catch (err) {
    console.error(err);
    saved = { original: null };
  }
  return saved;
};
/**
 * Read `event.event_place.city.contextual_name`.
 * @param event - Event object to read from.
 * @returns The value at that path, or '' when `pathOr` falls back to its default.
 */
const get_city_name = (event) => {
  const city_name_path = 'event_place.city.contextual_name'.split('.');
  return pathOr('', city_name_path, event);
};
/**
 * Read `event.event_place.contextual_name`.
 * @param event - Event object to read from.
 * @returns The value at that path, or '' when `pathOr` falls back to its default.
 */
const get_event_host = (event) => {
  const host_name_path = 'event_place.contextual_name'.split('.');
  return pathOr('', host_name_path, event);
};
/**
 * Make sure the directory used for event images exists.
 *
 * @param images_directory - Path of the directory to create.
 * @returns {Promise} Rejects with an `Error` ('Image path was not set') when
 *   the path is `null`/`undefined`. Resolves immediately when the path
 *   already exists. Otherwise resolves once `fs.mkdir` has finished; a mkdir
 *   failure is logged with `console.error` and the promise still resolves.
 */
const create_images_directory = (images_directory) => {
  if (images_directory === null || images_directory === undefined) {
    // Reject with a real Error (not a bare string) so callers get a stack
    // trace and can rely on `instanceof Error`.
    return Promise.reject(new Error('Image path was not set'));
  }
  if (filesystem.existsSync(images_directory)) {
    return Promise.resolve();
  }
  // `recursive: true` also creates any missing parent directories.
  return fs.mkdir(images_directory, { recursive: true }).catch(console.error);
};
const read_previous_events = (path) => {
if (path !== null) {
if (filesystem.existsSync(path)) {
@ -206,36 +114,6 @@ const read_previous_events = (path) => {
return Promise.resolve([]);
};
/**
 * Navigate `page` to an event and pick the largest `.jpg` response body seen
 * while the page loads.
 *
 * @param page - Object exposing `on(event, handler)` and `goto(url)`
 *   (presumably a puppeteer Page — the caller passes `browser.newPage()`).
 * @param event_id - Event identifier, turned into a URL by `event_url`.
 * @returns {Promise<Object>} `{ image }` holding the largest captured body
 *   (compared by `.length`; the first one wins on ties), or `{}` when no
 *   `.jpg` response was captured or navigation failed (the failure is logged).
 */
const load_event = async (page, event_id) => {
  try {
    const images = [];
    const dom_ready = new Promise((resolve) => {
      page.on('response', async (response) => {
        // This handler is async and nobody awaits it, so every failure must
        // be handled here; otherwise it surfaces as an unhandled rejection.
        try {
          const { pathname } = new URL(response.request().url());
          if (path.extname(pathname) !== '.jpg') {
            return;
          }
          images.push(await response.buffer());
        } catch (err) {
          console.error(err);
        }
      });
      page.on('domcontentloaded', () => {
        resolve();
      });
    });

    await page.goto(event_url(event_id));
    await dom_ready;

    // No image is an ordinary outcome, not an error: skip the reduce, which
    // would throw on an empty array without an initial value.
    if (images.length === 0) {
      return {};
    }
    const image = images.reduce((largest, candidate) =>
      candidate.length > largest.length ? candidate : largest,
    );
    return { image };
  } catch (e) {
    console.error(e);
    return {};
  }
};
const map_event = ({ node: event }) => {
const ticket_url = pathOr('', ['event_buy_ticket_url'], event);
const city = get_city_name(event);
@ -391,12 +269,9 @@ const get_page_events = async (
};
// CommonJS export list: the functions this module makes available to
// callers that `require` it.
module.exports = {
create_images_directory,
get_page_events,
load_event,
merge_edges,
open_browser,
parse_args,
read_previous_events,
save_images,
};

27
src/scrape.js

@ -1,32 +1,23 @@
const { pathOr, uniqBy, eqBy, prop, union } = require('ramda');
const {
create_images_directory,
get_page_events,
merge_edges,
open_browser,
parse_args,
read_previous_events,
merge_edges,
load_event,
save_images,
} = require('./logic');
// Parse the command line into named options. `process.argv.slice(2)` drops
// the first two argv entries so only the user-supplied arguments are parsed.
// The `events` option is bound locally as `event_file`.
const {
events: event_file,
get_past_events,
get_upcoming_events,
image_directory,
images,
output,
page_ids,
headless,
} = parse_args(process.argv.slice(2));
(async () => {
if (images) {
create_images_directory(image_directory);
}
let events = [];
const browser = await open_browser({ headless });
@ -43,22 +34,6 @@ const {
} catch (e) {
console.error(e);
}
if (images) {
page_events = await Promise.all(
page_events.map(async (event) => {
const event_page = await browser.newPage();
const { image } = await load_event(event_page, event.event_id);
await event_page.close();
const images = await save_images(
image,
event.event_id,
image_directory,
);
return { images, ...event };
}),
);
}
events = uniqBy(eqBy(prop('event_id')))(union(events, page_events));
}

53
yarn.lock

@ -1254,16 +1254,6 @@ arr-union@^3.1.0:
resolved "https://registry.yarnpkg.com/arr-union/-/arr-union-3.1.0.tgz#e39b09aea9def866a8f206e288af63919bae39c4"
integrity sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ=
array-parallel@~0.1.3:
version "0.1.3"
resolved "https://registry.yarnpkg.com/array-parallel/-/array-parallel-0.1.3.tgz#8f785308926ed5aa478c47e64d1b334b6c0c947d"
integrity sha1-j3hTCJJu1apHjEfmTRszS2wMlH0=
array-series@~0.1.5:
version "0.1.5"
resolved "https://registry.yarnpkg.com/array-series/-/array-series-0.1.5.tgz#df5d37bfc5c2ef0755e2aa4f92feae7d4b5a972f"
integrity sha1-3103v8XC7wdV4qpPkv6ufUtaly8=
array-unique@^0.3.2:
version "0.3.2"
resolved "https://registry.yarnpkg.com/array-unique/-/array-unique-0.3.2.tgz#a894b75d4bc4f6cd679ef3244a9fd8f46ae2d428"
@ -1668,14 +1658,6 @@ core-util-is@1.0.2:
resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7"
integrity sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=
cross-spawn@^4.0.0:
version "4.0.2"
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-4.0.2.tgz#7b9247621c23adfdd3856004a823cbe397424d41"
integrity sha1-e5JHYhwjrf3ThWAEqCPL45dCTUE=
dependencies:
lru-cache "^4.0.1"
which "^1.2.9"
cross-spawn@^6.0.0:
version "6.0.5"
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4"
@ -1743,13 +1725,6 @@ debug@^2.2.0, debug@^2.3.3:
dependencies:
ms "2.0.0"
debug@^3.1.0:
version "3.2.6"
resolved "https://registry.yarnpkg.com/debug/-/debug-3.2.6.tgz#e83d17de16d8a7efb7717edbe5fb10135eee629b"
integrity sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==
dependencies:
ms "^2.1.1"
decamelize@^1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
@ -2339,16 +2314,6 @@ globals@^12.1.0:
dependencies:
type-fest "^0.8.1"
gm@^1.23.1:
version "1.23.1"
resolved "https://registry.yarnpkg.com/gm/-/gm-1.23.1.tgz#2edeeb958084d0f8ea7988e5d995b1c7dfc14777"
integrity sha1-Lt7rlYCE0PjqeYjl2ZWxx9/BR3c=
dependencies:
array-parallel "~0.1.3"
array-series "~0.1.5"
cross-spawn "^4.0.0"
debug "^3.1.0"
graceful-fs@^4.2.4:
version "4.2.4"
resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.4.tgz#2256bde14d3632958c465ebc96dc467ca07a29fb"
@ -3307,14 +3272,6 @@ loose-envify@^1.0.0:
dependencies:
js-tokens "^3.0.0 || ^4.0.0"
lru-cache@^4.0.1:
version "4.1.5"
resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-4.1.5.tgz#8bbe50ea85bed59bc9e33dcab8235ee9bcf443cd"
integrity sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g==
dependencies:
pseudomap "^1.0.2"
yallist "^2.1.2"
make-dir@^3.0.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-3.1.0.tgz#415e967046b3a7f1d185277d84aa58203726a13f"
@ -3802,11 +3759,6 @@ proxy-from-env@^1.0.0:
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2"
integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==
pseudomap@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/pseudomap/-/pseudomap-1.0.2.tgz#f052a28da70e618917ef0a8ac34c1ae5a68286b3"
integrity sha1-8FKijacOYYkX7wqKw0wa5aaChrM=
psl@^1.1.28:
version "1.8.0"
resolved "https://registry.yarnpkg.com/psl/-/psl-1.8.0.tgz#9326f8bcfb013adcc005fdff056acce020e51c24"
@ -4862,11 +4814,6 @@ y18n@^4.0.0:
resolved "https://registry.yarnpkg.com/y18n/-/y18n-4.0.0.tgz#95ef94f85ecc81d007c264e190a120f0a3c8566b"
integrity sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==
yallist@^2.1.2:
version "2.1.2"
resolved "https://registry.yarnpkg.com/yallist/-/yallist-2.1.2.tgz#1c11f9218f076089a47dd512f93c6699a6a81d52"
integrity sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=
yargs-parser@^18.1.1:
version "18.1.3"
resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.3.tgz#be68c4975c6b2abf469236b0c870362fab09a7b0"

Loading…
Cancel
Save