
update scraper

Branch: fix-broken-scrape
Jørgen Sverre Lien Sellæg, 4 years ago
Commit: 99cb863ca2

Changed files:
  1. bin/csv-to-server.mjs (98 changed lines)
  2. bin/scrape.mjs (25 changed lines)
  3. src/facebook/get-page-events.mjs (8 changed lines)

bin/csv-to-server.mjs (new file, 98 lines)

@@ -0,0 +1,98 @@
+import fetch from 'node-fetch';
+
+process.stdin.resume();
+process.stdin.setEncoding('utf8');
+
+let input = [];
+process.stdin.on('data', (data) => {
+  input.push(data);
+});
+
+const token =
+  '1234567812345678123456781234567812345678123456781234567812345678';
+const api = 'http://localhost:3333';
+const headers = { 'Content-Type': 'application/json' };
+
+// True if any tracked field differs between the stored and scraped event.
+const updated = (oldEvent, scrapedEvent) => {
+  let keys = [
+    'canceled',
+    'end',
+    'start',
+    'draft',
+    'facebook_id',
+    'place_id',
+    'name',
+    'ticket_url',
+    'id'
+  ];
+  for (let key of keys) {
+    if (oldEvent[key] != scrapedEvent[key]) {
+      return true;
+    }
+  }
+  return false;
+};
+
+process.stdin.on('end', async () => {
+  // Each stdin line is one ¤-separated event record from the scraper.
+  for (let event_line of input.join('').split('\n')) {
+    const [
+      facebook_id = '',
+      location_name = '',
+      place_id = '',
+      name = '',
+      start = '',
+      ticket_url = ''
+    ] = event_line.split('¤');
+    let payload = {
+      draft: false,
+      canceled: false,
+      facebook_id,
+      location_name,
+      place_id: Number(place_id),
+      name,
+      start: Number(start),
+      ticket_url
+    };
+    let search = await fetch(
+      `${api}/search/events/?facebook_id=${payload.facebook_id}&token=${token}`
+    );
+    if (!search.ok) {
+      console.log(await search.text());
+      continue;
+    }
+    search = await search.json();
+    let new_event = search.length === 0;
+    let old_event;
+    if (!new_event) {
+      old_event = search[0];
+    }
+    let res;
+    if (new_event) {
+      res = await fetch(`${api}/events/?token=${token}`, {
+        method: 'POST',
+        body: JSON.stringify(payload),
+        headers
+      });
+      console.log(res.status, 'Insert', payload.name);
+    } else if (old_event && updated(old_event, payload)) {
+      payload.id = old_event.id;
+      // Keep the stored ticket URL when the scrape came back without one.
+      if (old_event.ticket_url.length > 0 && payload.ticket_url.length == 0) {
+        payload.ticket_url = old_event.ticket_url;
+      }
+      // Re-check: merging the ticket URL may have removed the only difference.
+      if (updated(old_event, payload)) {
+        res = await fetch(`${api}/events/${payload.id}/?token=${token}`, {
+          method: 'PATCH',
+          body: JSON.stringify(payload),
+          headers
+        });
+        console.log(res.status, 'Update', payload.name);
+      } else {
+        console.log(201, 'Skip', payload.name);
+      }
+    } else {
+      console.log(201, 'Skip', payload.name);
+    }
+    // Only log the response when a request was actually made; skipped
+    // events leave res unset.
+    if (res) {
+      console.log(res.status, await res.text());
+    }
+  }
+});
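
For context on how csv-to-server.mjs decides between Insert, Update, and Skip: below is a minimal, self-contained sketch of the updated() check, with hypothetical event values for illustration (the field list comes from the code above; the sample data is not from the repo).

import assert from 'node:assert';

// Self-contained copy of the updated() comparison above.
const updated = (oldEvent, scrapedEvent) => {
  const keys = ['canceled', 'end', 'start', 'draft', 'facebook_id',
                'place_id', 'name', 'ticket_url', 'id'];
  return keys.some((key) => oldEvent[key] != scrapedEvent[key]);
};

// Hypothetical stored and scraped events.
const stored = { name: 'Concert', start: 1609459200, ticket_url: 'https://example.com/t' };
const scraped = { name: 'Concert', start: 1609459200, ticket_url: '' };

assert.equal(updated(stored, scraped), true);  // ticket_url differs
scraped.ticket_url = stored.ticket_url;        // the script keeps a stored URL
assert.equal(updated(stored, scraped), false); // now a Skip, not a PATCH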

bin/scrape.mjs (25 changed lines)

@@ -50,20 +50,26 @@ const updated = (oldEvent, scrapedEvent) => {
   const scrape = place.scraper == 'facebook';
   if (!scrape) {
     console.log(
-      100,
+      101,
       `Skipping #${place.id} ${place.name}. Reason: Scraper is ${place.scraper}`
     );
     return false;
   }
-  return scrape;
+  const now = unix(new Date());
+  const recently = place.last_scraped + place.scrape_threshold;
+  if (now < recently) {
+    console.log(
+      100,
+      `Skipping #${place.id} ${place.name}. Reason: Was scraped ${
+        now - place.last_scraped
+      }s ago.`
+    );
+    return false;
+  }
+  return true;
 });
 for (let place of places) {
-  if (place.id < 13) {
-    console.log(100, `Skipping #${place.id} ${place.name}`);
-    continue;
-  } else {
     console.log(100, `Scraping #${place.id} ${place.name}`);
-  }
   const events = await scrape(place.facebook_id);
   let payloads = [];
   for (let event of events) {
@@ -78,11 +84,14 @@ const updated = (oldEvent, scrapedEvent) => {
       ticket_url: event.event_buy_ticket_url ?? ''
     });
   }
+  console.log(payloads);
   for (let payload of payloads) {
     let search = await fetch(
       `${api}/search/events/?facebook_id=${payload.facebook_id}&token=${token}`
     );
     if (!search.ok) {
+      console.log('hææææ');
       console.log(await search.text());
+      await sleep(200);
       continue;
     }
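
The new threshold check above relies on a unix() helper and epoch-second last_scraped / scrape_threshold fields that are not shown in this diff. The following is a plausible sketch of such a helper, stated as an assumption rather than the repo's actual code:

// Assumed shape of the unix() helper used above: whole seconds since
// the epoch, so it is comparable with last_scraped + scrape_threshold.
const unix = (date) => Math.floor(date.getTime() / 1000);

// Example: a place scraped 60 s ago with a 3600 s threshold gets skipped.
const place = { last_scraped: unix(new Date()) - 60, scrape_threshold: 3600 };
console.log(unix(new Date()) < place.last_scraped + place.scrape_threshold); // true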

src/facebook/get-page-events.mjs (8 changed lines)

@@ -1,5 +1,4 @@
 import { do_request } from './graphql-api-request.mjs';
-const sleep = (s) => new Promise((res) => setTimeout(res, s * 1000));
 
 /// PageEventsTabPastEventsCardRendererQuery
 export const past_render_query = async ({ pageID }) => {
@@ -71,16 +70,13 @@ export const get_page_events = async ({
 }) => {
   let res = [];
-  if (get_past_events) {
     const result = await past_render_query({ pageID });
     if (result !== null) {
       let { has_next_page, end_cursor: cursor } = result.page_info;
       let { edges } = result;
       let retries = 0;
-      while (has_next_page) {
-        sleep(0.1);
+      while (get_past_events && has_next_page) {
         const paginationResult = await past_pagination_query({
           cursor,
           pageID
@@ -100,7 +96,6 @@ export const get_page_events = async ({
       }
       res = [...edges.map(({ node }) => node)];
     }
-  }
 
   if (get_upcoming_events) {
     const result = await upcoming_render_query({ pageID });
@@ -109,7 +104,6 @@ export const get_page_events = async ({
       let { edges } = result;
       let retries = 0;
       while (has_next_page) {
-        sleep(0.1);
         const paginationResult = await upcoming_pagination_query({
           cursor,
           pageID
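
Worth noting on the sleep(0.1) removals above: the calls were never awaited, so the returned promise resolved in the background and the pagination loops were never actually throttled. If throttling is wanted later, the helper has to be awaited inside the async loop, along these lines (a sketch, not code from this repo):

const sleep = (s) => new Promise((res) => setTimeout(res, s * 1000));

const paginate = async () => {
  for (let page = 0; page < 3; page++) {
    await sleep(0.1); // pauses ~100 ms per iteration, but only because of the await
    console.log('fetched page', page);
  }
};
paginate();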
