Browse Source

remove multiple pages support

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
c9f2aed104
  1. 4
      scrape.sh
  2. 9
      src/logic.js
  3. 11
      src/scrape.js
  4. 24
      tests/parse_args.test.js

4
scrape.sh

@ -12,8 +12,6 @@ function usage {
of prevously parsed events. of prevously parsed events.
-p --page Facebook page id. Scrape all events of a -p --page Facebook page id. Scrape all events of a
specific facebook page. specific facebook page.
--pages List of Facebook page ids. See examples for
format.
-o --output Output events into this path instead of -o --output Output events into this path instead of
stdout. stdout.
--skip-upcoming-events Default: The scraper will automatically --skip-upcoming-events Default: The scraper will automatically
@ -35,7 +33,7 @@ function usage {
./scrape.sh -p livesentralen -o events.json --events=events.json ./scrape.sh -p livesentralen -o events.json --events=events.json
# You can redirect standard output into a file # You can redirect standard output into a file
./scrape.sh --pages="livesentralen,tyventrondheim" > events.json ./scrape.sh --page="tyventrondheim" > events.json
EOF EOF
} }

9
src/logic.js

@ -60,11 +60,7 @@ const parse_args = (args) => {
const headless = pathOr(true, ['headless'], argv); const headless = pathOr(true, ['headless'], argv);
return { return {
page_ids: [ page_id: [...parse_param('page'), ...parse_param('p')].pop(),
...parse_param('page'),
...parse_param('p'),
...parse_param('pages'),
],
events, events,
output, output,
get_upcoming_events, get_upcoming_events,
@ -139,8 +135,6 @@ const has_upcoming_events = (body) =>
const has_past_events = (body) => const has_past_events = (body) =>
body.includes('past events') && !body.includes('not have any past events'); body.includes('past events') && !body.includes('not have any past events');
const merge_events = (a, b) => uniqBy(eqBy(prop('event_id')))(union(a, b));
const by_date = (a, b) => { const by_date = (a, b) => {
const b_date = b.date.start; const b_date = b.date.start;
const a_date = a.date.start; const a_date = a.date.start;
@ -175,7 +169,6 @@ module.exports = {
has_past_events, has_past_events,
has_upcoming_events, has_upcoming_events,
map_event, map_event,
merge_events,
parse_args, parse_args,
read_previous_events, read_previous_events,
to_unique_events, to_unique_events,

11
src/scrape.js

@ -1,11 +1,11 @@
const { const {
by_date, by_date,
event_date_to_date_obj, event_date_to_date_obj,
merge_events,
parse_args, parse_args,
read_previous_events, read_previous_events,
to_unique_events, to_unique_events,
} = require('./logic'); } = require('./logic');
const { open_browser, get_page_events } = require('./browser'); const { open_browser, get_page_events } = require('./browser');
const { const {
@ -13,24 +13,21 @@ const {
get_past_events, get_past_events,
get_upcoming_events, get_upcoming_events,
output, output,
page_ids, page_id,
headless, headless,
} = parse_args(process.argv.slice(2)); } = parse_args(process.argv.slice(2));
(async () => { (async () => {
let events = []; let events = [];
try { try {
const browser = await open_browser({ headless }); const browser = await open_browser({ headless });
for (let page_id of page_ids) { events = await get_page_events(
let page_events = [];
page_events = await get_page_events(
browser, browser,
page_id, page_id,
get_upcoming_events, get_upcoming_events,
get_past_events, get_past_events,
); );
events = merge_events(events, page_events);
}
} catch (e) { } catch (e) {
console.error(e); console.error(e);
} }

24
tests/parse_args.test.js

@ -6,38 +6,22 @@ const process_mock = jest.spyOn(process, 'exit').mockImplementation(() => true);
describe('test parse args', () => { describe('test parse args', () => {
it('parses a single page id with -p', () => { it('parses a single page id with -p', () => {
const res = parse_args(['-p', 'foo']); const res = parse_args(['-p', 'foo']);
expect(res.page_ids).toEqual(['https://www.facebook.com/foo/events/']); expect(res.page_id).toEqual('https://www.facebook.com/foo/events/');
}); });
it('parses a single page id with -p', () => { it('parses a single page id with -p', () => {
const res = parse_args(['-p', '"foo"']); const res = parse_args(['-p', '"foo"']);
expect(res.page_ids).toEqual(['https://www.facebook.com/foo/events/']); expect(res.page_id).toEqual('https://www.facebook.com/foo/events/');
}); });
it('parses a single page id with --page', () => { it('parses a single page id with --page', () => {
const res = parse_args(['--page=foo']); const res = parse_args(['--page=foo']);
expect(res.page_ids).toEqual(['https://www.facebook.com/foo/events/']); expect(res.page_id).toEqual('https://www.facebook.com/foo/events/');
}); });
it('parses a single page id with --page', () => { it('parses a single page id with --page', () => {
const res = parse_args(['--page="foo"']); const res = parse_args(['--page="foo"']);
expect(res.page_ids).toEqual(['https://www.facebook.com/foo/events/']); expect(res.page_id).toEqual('https://www.facebook.com/foo/events/');
});
it('parses multiple page ids with --pages', () => {
const res = parse_args(['--pages=foo,bar']);
expect(res.page_ids).toEqual([
'https://www.facebook.com/foo/events/',
'https://www.facebook.com/bar/events/',
]);
});
it('parses multiple page ids with --pages', () => {
const res = parse_args(['--pages="foo,bar"']);
expect(res.page_ids).toEqual([
'https://www.facebook.com/foo/events/',
'https://www.facebook.com/bar/events/',
]);
}); });
['-?', '--help', '-h'].forEach((param) => { ['-?', '--help', '-h'].forEach((param) => {

Loading…
Cancel
Save