From 99cb863ca22e73c9bf4925a81ecf6ceae78c12c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?=
Date: Wed, 6 Jul 2022 18:21:24 +0200
Subject: [PATCH] update scraper

---
 bin/csv-to-server.mjs            | 98 ++++++++++++++++++++++++++++++++
 bin/scrape.mjs                   | 25 +++++---
 src/facebook/get-page-events.mjs | 52 ++++++++---------
 3 files changed, 138 insertions(+), 37 deletions(-)
 create mode 100644 bin/csv-to-server.mjs

diff --git a/bin/csv-to-server.mjs b/bin/csv-to-server.mjs
new file mode 100644
index 0000000..f020f7d
--- /dev/null
+++ b/bin/csv-to-server.mjs
@@ -0,0 +1,98 @@
+import fetch from 'node-fetch';
+
+process.stdin.resume();
+process.stdin.setEncoding('utf8');
+let input = [];
+process.stdin.on('data', (data) => {
+  input.push(data);
+});
+
+const token =
+  '1234567812345678123456781234567812345678123456781234567812345678';
+
+const api = 'http://localhost:3333';
+const headers = { 'Content-Type': 'application/json' };
+
+const updated = (oldEvent, scrapedEvent) => {
+  let keys = [
+    'canceled',
+    'end',
+    'start',
+    'draft',
+    'facebook_id',
+    'place_id',
+    'name',
+    'ticket_url',
+    'id'
+  ];
+  for (let key of keys) {
+    if (oldEvent[key] != scrapedEvent[key]) {
+      return true;
+    }
+  }
+  return false;
+};
+
+process.stdin.on('end', async () => {
+  for (let event_line of input.join('').split('\n')) {
+    const [
+      facebook_id = '',
+      location_name = '',
+      place_id = '',
+      name = '',
+      start = '',
+      ticket_url = ''
+    ] = event_line.split('¤');
+    let payload = {
+      draft: false,
+      canceled: false,
+      facebook_id,
+      location_name,
+      place_id: Number(place_id),
+      name,
+      start: Number(start),
+      ticket_url
+    };
+
+    let search = await fetch(
+      `${api}/search/events/?facebook_id=${payload.facebook_id}&token=${token}`
+    );
+    if (!search.ok) {
+      console.log(await search.text());
+      continue;
+    }
+    search = await search.json();
+    let new_event = search.length === 0;
+    let old_event;
+    if (!new_event) {
+      old_event = search[0];
+    }
+    let res;
+    if (new_event) {
+      res = await fetch(`${api}/events/?token=${token}`, {
+        method: 'POST',
+        body: JSON.stringify(payload),
+        headers
+      });
+      console.log(res.status, 'Insert', payload.name);
+    } else if (old_event && updated(old_event, payload)) {
+      payload.id = old_event.id;
+      if (old_event.ticket_url.length > 0 && payload.ticket_url.length == 0) {
+        payload.ticket_url = old_event.ticket_url;
+      }
+      if (updated(old_event, payload)) {
+        res = await fetch(`${api}/events/${payload.id}/?token=${token}`, {
+          method: 'PATCH',
+          body: JSON.stringify(payload),
+          headers
+        });
+        console.log(res.status, 'Update', payload.name);
+      } else {
+        console.log(201, 'Skip', payload.name);
+      }
+    } else {
+      console.log(201, 'Skip', payload.name);
+    }
+    if (res) console.log(res.status, await res.text());
+  }
+});
diff --git a/bin/scrape.mjs b/bin/scrape.mjs
index 261d1d4..f013c0d 100644
--- a/bin/scrape.mjs
+++ b/bin/scrape.mjs
@@ -50,20 +50,26 @@ const updated = (oldEvent, scrapedEvent) => {
   const scrape = place.scraper == 'facebook';
   if (!scrape) {
     console.log(
-      100,
+      101,
       `Skipping #${place.id} ${place.name}. Reason: Scraper is ${place.scraper}`
     );
+    return false;
+  }
+  const now = unix(new Date());
+  const recently = place.last_scraped + place.scrape_threshold;
+  if (now < recently) {
+    console.log(
+      100,
+      `Skipping #${place.id} ${place.name}. Reason: Was scraped ${
+        now - place.last_scraped
+      }s ago.`
+    );
+    return false;
   }
-  return scrape;
+  return true;
 });
 
 for (let place of places) {
-  if (place.id < 13) {
-    console.log(100, `Skipping #${place.id} ${place.name}`);
-    continue;
-  } else {
-    console.log(100, `Scraping #${place.id} ${place.name}`);
-  }
   const events = await scrape(place.facebook_id);
   let payloads = [];
   for (let event of events) {
@@ -78,11 +84,14 @@ const updated = (oldEvent, scrapedEvent) => {
       ticket_url: event.event_buy_ticket_url ?? ''
     });
   }
+  console.log(payloads);
   for (let payload of payloads) {
     let search = await fetch(
       `${api}/search/events/?facebook_id=${payload.facebook_id}&token=${token}`
     );
     if (!search.ok) {
+      console.log('Search request failed:');
+      console.log(await search.text());
       await sleep(200);
       continue;
     }
diff --git a/src/facebook/get-page-events.mjs b/src/facebook/get-page-events.mjs
index 3127805..f04a150 100644
--- a/src/facebook/get-page-events.mjs
+++ b/src/facebook/get-page-events.mjs
@@ -1,5 +1,4 @@
 import { do_request } from './graphql-api-request.mjs';
-const sleep = (s) => new Promise((res) => setTimeout(res, s * 1000));
 
 /// PageEventsTabPastEventsCardRendererQuery
 export const past_render_query = async ({ pageID }) => {
@@ -71,35 +70,31 @@ export const get_page_events = async ({
 }) => {
   let res = [];
 
-  if (get_past_events) {
-    const result = await past_render_query({ pageID });
-
-    if (result !== null) {
-      let { has_next_page, end_cursor: cursor } = result.page_info;
-
-      let { edges } = result;
-      let retries = 0;
-      while (has_next_page) {
-        sleep(0.1);
-        const paginationResult = await past_pagination_query({
-          cursor,
-          pageID
-        });
-        if (paginationResult === null) {
-          ++retries;
-          console.error(retries, 'retrying');
-          continue;
-        }
-        if (retries > 10) {
-          break;
-        }
-        retries = 0;
-        edges = [...edges, ...paginationResult.edges];
-        has_next_page = paginationResult?.page_info?.has_next_page ?? false;
-        cursor = paginationResult.page_info.end_cursor;
+  const result = await past_render_query({ pageID });
+
+  if (result !== null) {
+    let { has_next_page, end_cursor: cursor } = result.page_info;
+    let { edges } = result;
+    let retries = 0;
+    while (get_past_events && has_next_page) {
+      const paginationResult = await past_pagination_query({
+        cursor,
+        pageID
+      });
+      if (paginationResult === null) {
+        ++retries;
+        console.error(retries, 'retrying');
+        continue;
       }
-      res = [...edges.map(({ node }) => node)];
+      if (retries > 10) {
+        break;
+      }
+      retries = 0;
+      edges = [...edges, ...paginationResult.edges];
+      has_next_page = paginationResult?.page_info?.has_next_page ?? false;
+      cursor = paginationResult.page_info.end_cursor;
     }
+    res = [...edges.map(({ node }) => node)];
  }
 
   if (get_upcoming_events) {
@@ -109,7 +104,6 @@ export const get_page_events = async ({
     let { edges } = result;
     let retries = 0;
     while (has_next_page) {
-      sleep(0.1);
       const paginationResult = await upcoming_pagination_query({
         cursor,
         pageID
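
Reviewer note: the new bin/csv-to-server.mjs reads ¤-separated event lines from stdin in the field order destructured above (facebook_id¤location_name¤place_id¤name¤start¤ticket_url), but the patch itself does not show where those lines come from. As a minimal sketch only — assuming the hardcoded localhost:3333 API and token above, and using a hypothetical helper file name (feed-one-event.mjs) that is not part of this change — a single event could be fed in like this:

// feed-one-event.mjs — illustrative only, not part of this patch.
// Builds one ¤-separated line and pipes it into bin/csv-to-server.mjs.
import { spawn } from 'node:child_process';

// Field order must match the destructuring in bin/csv-to-server.mjs:
// facebook_id ¤ location_name ¤ place_id ¤ name ¤ start (unix seconds) ¤ ticket_url
const line = [
  '1234567890',                                  // facebook_id (made up)
  'Some Venue',                                  // location_name (made up)
  '42',                                          // place_id (made up)
  'Example Concert',                             // name (made up)
  String(Math.floor(Date.now() / 1000) + 86400), // start: 24h from now
  'https://example.com/tickets'                  // ticket_url (made up)
].join('¤');

const child = spawn('node', ['bin/csv-to-server.mjs'], {
  stdio: ['pipe', 'inherit', 'inherit']
});
child.stdin.write(line + '\n');
child.stdin.end();

Run from the repo root with the API listening on localhost:3333; the importer then searches, inserts, updates, or skips the event exactly as its per-line loop does for real scraper output.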