Browse Source

update scraper

fix-broken-scrape
Jørgen Sverre Lien Sellæg 4 years ago
parent
commit
5701c5b8bf
  1. 82
      bin/run.sh
  2. 24
      bin/scrape
  3. 13
      src/facebook/get-page-events.mjs
  4. 7
      src/facebook/graphql-api-request.mjs

82
bin/run.sh

@ -0,0 +1,82 @@
#!/usr/bin/env bash
# Runs bin/scrape once per Facebook page listed below and appends whatever the
# scraper prints on stdout to that page's file under events/.
#
# Paths are relative, so the script is expected to be started from the
# directory that contains both bin/ and events/.

mkdir -p events

# Entries that are currently disabled (page ID, then output file in events/):
#   145869695444569 AftermathMusicNorway-145869695444569.json
#   774453762576345 BarnefestivalenJubaJuba-774453762576345.json

# One entry per page: "<argument passed to bin/scrape> <output file in events/>".
targets=(
  '750972958283466 Boxerbar-750972958283466.old.json'
  '153799614679194 Britannia-153799614679194.Hotel.old.json'
  '380071843764 ByscenenTrondheim-380071843764.old.json'
  '220501644709429 CinemateketTrondheim-220501644709429.old.json'
  '225737590803212 Diskoteketbar-225737590803212.old.json'
  '325557631163470 Dora3Trondheim-325557631163470.old.json'
  '784700614931033 Goodomensbar-784700614931033.old.json'
  '1673051982745157 HjortenScene-1673051982745157.old.json'
  '11541693212 ISFiTNorway-11541693212.old.json'
  '1419662314995709 KafeSkuret-1419662314995709.old.json'
  '1092418890774152 Kafelarssen-1092418890774152.old.json'
  '154134355099614 KonsertGalleriet-154134355099614.old.json'
  '2198343290262409.trondheim Lager11-2198343290262409.trondheim.old.json'
  '287716286232 Pstereo-287716286232.old.json'
  '111985600537696 Sommer-i-Borggården-111985600537696-111985600537696.old.json'
  '151868258198492 Sukkerhuset-151868258198492.old.json'
  '274218068628 TRONDHEIMISAK-274218068628.old.json'
  '705753499633189 TrondheimCamping-705753499633189.old.json'
  '214229302015789 TrondheimJazzforum-214229302015789.old.json'
  '311676952866578 TrondheimLive-311676952866578.old.json'
  '114647705270950 TrondheimMetalFest-114647705270950.old.json'
  '1866856830009864 TrondheimRocks-1866856830009864.old.json'
  '175439175828277 TrondheimSpektrum-175439175828277.old.json'
  '108747454194270 TrykkerietScene-108747454194270.old.json'
  '626731467352429 WorkWorkTrd-626731467352429.old.json'
  '300444313497473 almasrestaurant-300444313497473.old.json'
  '106346922773023 antikvarene-106346922773023.old.json'
  '150118755488 artscenetrondheim-150118755488.old.json'
  '158501780863403 barcircus-158501780863403.old.json'
  '160806787399376 barmoskus-160806787399376.old.json'
  '1712859978926496 brorbaras-1712859978926496.old.json'
  '191072824261732 cafenim-191072824261732.old.json'
  '129738423764486 cafenimuser-129738423764486.old.json'
  '118990964789505 clarioncollectiongrandolav-118990964789505.old.json'
  '148274751888665 clariontrondheim-148274751888665.old.json'
  '378038755927834 comforthpark-378038755927834.old.json'
  '271908336649895 crispingloverrecordshop-271908336649895.old.json'
  '860188704010725 feminalen-860188704010725.old.json'
  '654277341296114 fru-654277341296114.lundgreen.old.json'
  '955982084466587 habitattrondheim-955982084466587.old.json'
  '143110949094001 ilabrainnstasjon-143110949094001.old.json'
  '105916402763134 jazzfestno-105916402763134.old.json'
  '80792973478 kamfest-80792973478.old.json'
  '137220573544106 komikerliv-137220573544106.old.json'
  '1402925659778914 konsertkollektiv-1402925659778914.old.json'
  '642243462457769 kulturnattrondheim-642243462457769.old.json'
  '246603018828671 kunsthalltrondheim-246603018828671.old.json'
  '180524565650597 ladekaia-180524565650597.old.json'
  '1552366408164455.scenen lamon-1552366408164455.scenen.old.json'
  '101846779891549 lobbyen-101846779891549.old.json'
  '191245541651071.trhm lokal-191245541651071.trhm.old.json'
  '346065342451673 midtbyenrockeklubb-346065342451673.old.json'
  '1786002618299415 nidelvenbarogscene-1786002618299415.old.json'
  '90016673206 olavsfest-90016673206.old.json'
  '149127815110411 olavshallenas-149127815110411.old.json'
  '130558850296573 qualityhotelpanorama-130558850296573.old.json'
  '286956505871 ringvemusikkmuseum-286956505871.old.json'
  '68623573578 rockheim-68623573578.old.json'
  '11782406887 samfundet-11782406887.old.json'
  '375949715790210 scandicbakklandet-375949715790210.old.json'
  '260116090806604 scandiclerkendal-260116090806604.old.json'
  '127377617334855 scandicnidelven-127377617334855.old.json'
  '799107733481572 scandicsolsiden-799107733481572.old.json'
  '169539553659073.trondheim scenekanten-169539553659073.trondheim.old.json'
  '110148939061528 solsidentrondheim-110148939061528.old.json'
  '109571644107406 stormfestivalen-109571644107406.old.json'
  '484033441709379 stundomrecords-484033441709379.old.json'
  '208801362485221 thonhotelprinsen-208801362485221.old.json'
  '1649281185339329 trollrestaurant-1649281185339329.old.json'
  '33687748452 trondercore-33687748452.old.json'
  '112475092130212.kunstmuseum trondheim-112475092130212.kunstmuseum.old.json'
  '192236567852623 trondheimbluesklubb-192236567852623.old.json'
  '125977737456625 trondheimcalling-125977737456625.old.json'
  '16582146804 trondheimfolkebibliotek-16582146804.old.json'
  '1858914204185866 trondheimstage-1858914204185866.old.json'
  '237410323543222 tyventrondheim-237410323543222.old.json'
  '697776233588499 uffabookingcrew-697776233588499.old.json'
  '965958530110723 vaarfruekirke-965958530110723.old.json'
)

# Pages are scraped one after another in list order. Each entry is split at
# its single space: the text before it is the scraper argument, the text after
# it is the file that the output is appended to (>>, never truncated).
for target in "${targets[@]}"; do
  bin/scrape "${target%% *}" >> "events/${target#* }"
done

24
bin/scrape

@ -0,0 +1,24 @@
#!/usr/bin/env node
/**
 * Command-line entry point: fetches the past events of a single page through
 * `get_page_events` and prints the result to stdout as one JSON document.
 *
 * Usage: bin/scrape <pageID>
 *
 * Diagnostics go to stderr. The process exit code is set to 1 when no page ID
 * is supplied or when the scrape fails, so that calling scripts can tell a
 * failed run apart from a successful one.
 */
// NOTE(review): minimist is loaded but never used in this script; the require
// is kept so module loading is unchanged — confirm whether it can be dropped.
const minimist = require('minimist');

// First positional command-line argument; passed through as `pageID`.
const pageID = process.argv[2];

const main = async () => {
  // `== null` matches both undefined (argument omitted) and null.
  if (pageID == null) {
    console.error('no page selected');
    process.exitCode = 1;
    // Stop here instead of calling the scraper with an undefined page ID.
    return;
  }

  try {
    // The scraper is an ES module (.mjs) while this script uses require(),
    // so it is loaded with a dynamic import().
    const { get_page_events } = await import(
      '../src/facebook/get-page-events.mjs'
    );
    const res = await get_page_events({
      pageID,
      get_upcoming_events: false,
      get_past_events: true
    });
    console.log(JSON.stringify(res));
  } catch (e) {
    console.error('error: ', e);
    // Report the failure through the exit status as well as stderr.
    process.exitCode = 1;
  }
};

main();

13
src/facebook/get-page-events.mjs

@ -35,7 +35,7 @@ export const upcoming_pagination_query = async ({ pageID, cursor }) => {
return page; return page;
}; };
const get_page_events = async ({ export const get_page_events = async ({
pageID, pageID,
get_past_events, get_past_events,
get_upcoming_events get_upcoming_events
@ -49,6 +49,7 @@ const get_page_events = async ({
let { has_next_page, end_cursor: cursor } = result.page_info; let { has_next_page, end_cursor: cursor } = result.page_info;
let { edges } = result; let { edges } = result;
let retries = 0;
while (has_next_page) { while (has_next_page) {
sleep(2); sleep(2);
const paginationResult = await past_pagination_query({ const paginationResult = await past_pagination_query({
@ -56,8 +57,13 @@ const get_page_events = async ({
pageID pageID
}); });
if (paginationResult === null) { if (paginationResult === null) {
++retries;
continue;
}
if (retries > 10) {
break; break;
} }
retries = 0;
edges = [...edges, ...paginationResult.edges]; edges = [...edges, ...paginationResult.edges];
has_next_page = paginationResult?.page_info?.has_next_page ?? false; has_next_page = paginationResult?.page_info?.has_next_page ?? false;
cursor = paginationResult.page_info.end_cursor; cursor = paginationResult.page_info.end_cursor;
@ -74,6 +80,7 @@ const get_page_events = async ({
upcoming_events = []; upcoming_events = [];
let { has_next_page, end_cursor: cursor } = result.page_info; let { has_next_page, end_cursor: cursor } = result.page_info;
let { edges } = result; let { edges } = result;
let retries = 0;
while (has_next_page) { while (has_next_page) {
sleep(2); sleep(2);
const paginationResult = await upcoming_pagination_query({ const paginationResult = await upcoming_pagination_query({
@ -81,6 +88,10 @@ const get_page_events = async ({
pageID pageID
}); });
if (paginationResult === null) { if (paginationResult === null) {
++retries;
continue;
}
if (retries > 10) {
break; break;
} }
edges = [...edges, ...paginationResult.edges]; edges = [...edges, ...paginationResult.edges];

7
src/facebook/graphql-api-request.mjs

@ -18,10 +18,13 @@ const random_int = (max, min) => Math.floor(Math.random() * (max - min) + min);
export const do_request = async (doc_id, variables, parse = true) => { export const do_request = async (doc_id, variables, parse = true) => {
const params = new URLSearchParams(); const params = new URLSearchParams();
const { ip, port, user, password } = proxies[random_int(0, 4)]; // const { ip, port, user, password } = proxies[random_int(0, proxies.length)];
const ip = "127.0.0.1";
const port = "24000";
let proxyOpts = url.parse(`http://${ip}:${port}`); let proxyOpts = url.parse(`http://${ip}:${port}`);
proxyOpts.auth = `${user}:${password}`; // proxyOpts.auth = `${user}:${password}`;
const agent = new https_proxy_agent(proxyOpts); const agent = new https_proxy_agent(proxyOpts);
params.append('doc_id', doc_id); params.append('doc_id', doc_id);
params.append('variables', JSON.stringify(variables)); params.append('variables', JSON.stringify(variables));

Loading…
Cancel
Save