Browse Source

separate browse code into own file

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
fe6d6054bf
  1. 142
      src/browse.js
  2. 137
      src/logic.js
  3. 10
      src/scrape.js

142
src/browse.js

@ -0,0 +1,142 @@
const puppeteer = require('puppeteer');
const {
has_past_events,
get_body_inner_text,
has_upcoming_events,
get_upcoming_events_from_page,
get_past_events_from_page,
map_event,
} = require('./logic');
const { graphql_endpoint } = require('./constants');
/**
 * Launch a Chromium instance through puppeteer.
 *
 * @param {{headless: boolean}} options - whether to run without a visible window.
 * @returns {Promise<object>} the launched puppeteer Browser.
 */
const open_browser = async ({ headless }) => {
  // --disable-dev-shm-usage avoids /dev/shm exhaustion in containers.
  const launch_options = {
    headless,
    args: ['--disable-dev-shm-usage'],
  };
  return puppeteer.launch(launch_options);
};
/**
 * Attach a response listener to `page` that collects paginated GraphQL
 * event results for `endpoint`.
 *
 * @param {string} endpoint - exact request URL to match (the GraphQL endpoint).
 * @param {object} page - puppeteer Page to listen on.
 * @param {boolean} [past_events=false] - extract past events instead of upcoming.
 * @returns {Promise<Array<object>>} resolves with every collected events page
 *   once a response reports `page_info.has_next_page === false`. Note: the
 *   promise never settles if that final page never arrives — callers are
 *   expected to drive pagination (e.g. by scrolling) until it does.
 */
const register_page_scraper = (endpoint, page, past_events = false) => {
  let responses = [];
  // Pick the extractor once, outside the hot response handler.
  const extract_events = past_events
    ? get_past_events_from_page
    : get_upcoming_events_from_page;
  return new Promise((resolve) => {
    page.on('response', async (response) => {
      if (endpoint !== response.request().url()) {
        return;
      }
      let json;
      try {
        json = await response.json();
      } catch (error) {
        // Body was not JSON (or already consumed); an event-listener's return
        // value is discarded, so just bail out of this handler.
        return;
      }
      const events = extract_events(json);
      if (events !== null) {
        responses = [events, ...responses];
        // The last page of results completes the scrape.
        if (!events.page_info.has_next_page) {
          resolve(responses);
        }
      }
    });
  });
};
/**
 * Open `page_id` in a new tab and scrape its upcoming and/or past events.
 *
 * @param {object} browser - puppeteer Browser (from open_browser).
 * @param {string} page_id - URL of the Facebook page to visit.
 * @param {boolean} [get_upcoming_events=true] - scrape upcoming events.
 * @param {boolean} [get_past_events=false] - scrape past events.
 * @returns {Promise<Array<object>>} mapped event objects (via map_event).
 */
const get_page_events = async (
  browser,
  page_id,
  get_upcoming_events = true,
  get_past_events = false,
) => {
  const facebook_page = await browser.newPage();
  let scraping_past_events = false;
  let scraping_upcoming_events = false;
  let past_events = Promise.resolve([]);
  let upcoming_events = Promise.resolve([]);
  // Register the response scrapers BEFORE navigation so no response is missed.
  if (get_past_events) {
    scraping_past_events = true;
    past_events = register_page_scraper(graphql_endpoint, facebook_page, true)
      .then((events) => {
        scraping_past_events = false;
        return events;
      })
      .catch((err) => {
        // Best-effort: log and fall back to no past events.
        console.error(err);
        scraping_past_events = false;
        return [];
      });
  }
  if (get_upcoming_events) {
    scraping_upcoming_events = true;
    upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
      .then((events) => {
        scraping_upcoming_events = false;
        return events;
      })
      .catch((err) => {
        console.error(err);
        scraping_upcoming_events = false;
        return [];
      });
  }
  await facebook_page.goto(page_id);
  await facebook_page.waitFor(2000);
  // Dismiss the cookie-consent dialog if one is shown.
  const accept_buttons = await facebook_page.$x(
    "//button[contains(text(), 'Accept All')]",
  );
  if (accept_buttons.length > 0) {
    // Bug fix: the click promise was previously left floating (not awaited).
    await accept_buttons[0].click();
  }
  const body_text = (await get_body_inner_text(facebook_page)).toLowerCase();
  // If the page text says there are no events of a kind, the scraper for that
  // kind would never resolve — short-circuit it to an empty result.
  if (get_past_events && !has_past_events(body_text)) {
    past_events = Promise.resolve([]);
    scraping_past_events = false;
  }
  if (get_upcoming_events && !has_upcoming_events(body_text)) {
    upcoming_events = Promise.resolve([]);
    scraping_upcoming_events = false;
  }
  // Scroll to trigger further GraphQL pagination requests until both scrapers
  // have settled. (The old inner `break` on the resolved flags was dead code:
  // when both were resolved the loop condition was already false.)
  while (scraping_past_events || scraping_upcoming_events) {
    await facebook_page.waitFor(1000);
    await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
  }
  const responses = [...(await upcoming_events), ...(await past_events)];
  const nodes = responses.reduce(
    (res, current) => [...res, ...current.edges],
    [],
  );
  return nodes.map(map_event);
};
module.exports = {
get_page_events,
open_browser,
};

137
src/logic.js

@ -4,13 +4,10 @@ const process = require('process');
const page_url = (page_id) => `https://www.facebook.com/${page_id}`; const page_url = (page_id) => `https://www.facebook.com/${page_id}`;
const page_events_url = (page_id) => page_url(page_id) + '/events/'; const page_events_url = (page_id) => page_url(page_id) + '/events/';
const { graphql_endpoint } = require('./constants');
const fs = require('fs').promises; const fs = require('fs').promises;
const filesystem = require('fs'); const filesystem = require('fs');
const puppeteer = require('puppeteer');
const flatten_string = (page_id) => { const flatten_string = (page_id) => {
if (page_id.startsWith('"') && page_id.endsWith('"')) { if (page_id.startsWith('"') && page_id.endsWith('"')) {
return page_id.slice(1, page_id.length - 1); return page_id.slice(1, page_id.length - 1);
@ -132,14 +129,6 @@ const map_event = ({ node: event }) => {
}; };
}; };
const open_browser = async ({ headless }) => {
const browser = await puppeteer.launch({
headless,
args: ['--disable-dev-shm-usage'],
});
return browser;
};
const get_body_inner_text = async (page) => const get_body_inner_text = async (page) =>
await page.evaluate('document.querySelector("body").innerText;'); await page.evaluate('document.querySelector("body").innerText;');
@ -150,128 +139,14 @@ const has_upcoming_events = (body) =>
const has_past_events = (body) => const has_past_events = (body) =>
body.includes('past events') && !body.includes('not have any past events'); body.includes('past events') && !body.includes('not have any past events');
const register_page_scraper = (endpoint, page, past_events = false) => {
let responses = [];
return new Promise((resolve, reject) => {
page.on('response', async (response) => {
if (endpoint === response.request().url()) {
let json = {};
try {
json = await response.json();
} catch (error) {
return responses;
}
const getters = {
upcoming: get_upcoming_events_from_page,
past: get_past_events_from_page,
};
const events = getters[past_events ? 'past' : 'upcoming'](json);
if (events !== null) {
responses = [events, ...responses];
if (!events.page_info.has_next_page) {
resolve(responses);
}
}
}
});
});
};
const get_page_events = async (
browser,
page_id,
get_upcoming_events = true,
get_past_events = false,
) => {
const facebook_page = await browser.newPage();
let past_events = [];
let upcoming_events = [];
let scraping_past_events = false;
let scraping_upcoming_events = false;
if (get_past_events) {
scraping_past_events = true;
past_events = register_page_scraper(graphql_endpoint, facebook_page, true)
.then((past_events) => {
scraping_past_events = false;
return past_events;
})
.catch((err) => {
console.error(err);
scraping_past_events = false;
return [];
});
} else {
past_events = Promise.resolve([]);
}
if (get_upcoming_events) {
scraping_upcoming_events = true;
upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
.then((upcoming_events) => {
scraping_upcoming_events = false;
return upcoming_events;
})
.catch((err) => {
console.error(err);
scraping_upcoming_events = false;
return [];
});
} else {
upcoming_events = Promise.resolve([]);
}
await facebook_page.goto(page_id);
await facebook_page.waitFor(2000);
const accept_buttons = await facebook_page.$x(
"//button[contains(text(), 'Accept All')]",
);
if (accept_buttons.length > 0) {
accept_buttons[0].click();
}
const body_text = (await get_body_inner_text(facebook_page)).toLowerCase();
const past_resolved = get_past_events && !has_past_events(body_text);
const upcoming_resolved =
get_upcoming_events && !has_upcoming_events(body_text);
if (past_resolved) {
past_events = Promise.resolve([]);
scraping_past_events = false;
}
if (upcoming_resolved) {
upcoming_events = Promise.resolve([]);
scraping_upcoming_events = false;
}
while (scraping_past_events || scraping_upcoming_events) {
await facebook_page.waitFor(1000);
await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
if (past_resolved && upcoming_resolved) {
break;
}
}
upcoming_events = await upcoming_events;
past_events = await past_events;
const responses = [...upcoming_events, ...past_events];
const nodes = responses.reduce(
(res, current) => [...res, ...current.edges],
[],
);
return nodes.map(map_event);
};
module.exports = { module.exports = {
get_page_events, has_past_events,
get_body_inner_text,
has_upcoming_events,
get_upcoming_events_from_page,
get_past_events_from_page,
map_event,
merge_edges, merge_edges,
open_browser,
parse_args, parse_args,
read_previous_events, read_previous_events,
}; };

10
src/scrape.js

@ -1,12 +1,6 @@
const { pathOr, uniqBy, eqBy, prop, union } = require('ramda'); const { pathOr, uniqBy, eqBy, prop, union } = require('ramda');
const { merge_edges, parse_args, read_previous_events } = require('./logic');
const { const { open_browser, get_page_events } = require('./browse');
get_page_events,
merge_edges,
open_browser,
parse_args,
read_previous_events,
} = require('./logic');
const { const {
events: event_file, events: event_file,

Loading…
Cancel
Save