From 039c774f3cc4e205570b1077575076a88184e5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?= Date: Tue, 5 May 2020 14:37:04 +0200 Subject: [PATCH] working scrape of chrome windows --- event-table.html | 51 -------------------------- index.html | 40 --------------------- scrape.js | 94 +++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 78 insertions(+), 107 deletions(-) delete mode 100644 event-table.html delete mode 100644 index.html diff --git a/event-table.html b/event-table.html deleted file mode 100644 index 2884233..0000000 --- a/event-table.html +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - -
- - - JUL - - 31 - - -
- -
- - Fri 8:00 PM UTC+02 - - 535 guests
-
-
-
- -
- Levanger
-
-
-
- -
-
- diff --git a/index.html b/index.html deleted file mode 100644 index a0c26b7..0000000 --- a/index.html +++ /dev/null @@ -1,40 +0,0 @@ - - KulturVarsel - Events | Facebook - - - - - - - - - - - - - - - - - - - - - - - - -
Jump to
Press alt + / to open this menu
See more of KulturVarsel on Facebook
See more of KulturVarsel on Facebook
or
Create New Account
Upcoming Events
MAY15
Fri 9:00 PM UTC+02327 guests
Verdalsøra
JUN19
Fri 7:00 PM UTC+02136 guests
JUL17
Jul 17 - Jul 19283 guests
Totsåsrock
JUL30
Thu 7:00 PM UTC+02358 guests
JUL30
Thu 8:00 PM UTC+0289 guests
Strømsøe-gården, Kirkegata 35
JUL30
Thu 8:00 PM UTC+02530 guests
Levanger
JUL30
Thu 10:00 PM UTC+02294 guests
JUL31
Fri 6:30 PM UTC+02478 guests
Festiviteten Levanger, Kirkegata 18
JUL31
Fri 8:00 PM UTC+02535 guests
JUL31
Fri 8:00 PM UTC+02325 guests
Havna Scene, Helga den Fagres gate 12
NOV14
Sat 6:00 PM UTC+011,078 guests
Levanger
Past Events
APR24
Fri 7:00 PM UTC+0281 guests
Verdalsøra
MAR19
Thu 7:00 PM UTC+01613 guests
MAR7
Feb 8 - Mar 8133 guests
Skatvalsrevyen
- - - - - diff --git a/scrape.js b/scrape.js index 1296ca8..780558a 100644 --- a/scrape.js +++ b/scrape.js @@ -1,11 +1,10 @@ const puppeteer = require('puppeteer'); const { JSDOM } = require('jsdom'); - // const event_page = 'https://www.facebook.com/KulturVarsel/events'; -const event_page = 'file:///home/zalox/src/kultar-events/index.html'; -const upcoming_event_id = 'upcoming_events_card'; -const upcoming_event_selector = `#${upcoming_event_id}`; - +// const event_page = 'https://www.facebook.com/pg/bandmaldito/events'; +const event_page = 'file:///home/zalox/src/kultar-events/index.chrome.html'; +const upcoming_event_id = 'past_events_card'; +const upcoming_event_selector = `#pastup_events_card > div > div:nth-child(2) > table > tbody > tr`; const month_name_to_number = (month_name) => { switch (month_name.toUpperCase()) { @@ -24,29 +23,87 @@ const month_name_to_number = (month_name) => { } } +const parse_event_time = (event_time_text) => { + const reversed_text = event_time_text.split("").reverse().join(""); + if (!(reversed_text.substr(2, 1) == '+')) { + return null; + } + + const timezone = reversed_text.substr(0,6).split("").reverse().join(""); + const minutes = reversed_text.substr(7,2).split("").reverse().join(""); + const hour = reversed_text.substr(10,2).split("").reverse().join("").trim(); + return { + tz: timezone, + min: minutes, + hour, + } +} + +const parse_ticket_location = (row) => { + const host = row.children[2].firstChild.firstChild.firstChild.innerHTML + const location = row.children[2].firstChild.lastChild.innerHTML + return { + host, + location, + }; +} + const parse_event_date = (row) => { const date_column = row.firstChild.firstChild; const month_text = new String(date_column.firstChild.innerHTML); + const month = new Number(month_name_to_number(month_text)) - 1; const day = new Number(date_column.lastChild.innerHTML); - return { - day: Number(day), - month: Number(month_name_to_number(month_text)), - }; + const event_time_text = row.children[1].lastChild.getElementsByTagName('span')[1].innerHTML; + const event_time = parse_event_time(event_time_text); + + if (event_time === null) { + return new Date(2020, month, day); + } + + return new Date(2020, month, day, event_time.hour, event_time.min); +} + +const parse_event_link = (row) => { + const link_text = row.children[1].firstChild.getElementsByTagName('a')[0].href; + const event_id = link_text.split('/')[2]; + return event_id; } +const parse_ticket_url = (row) => { + const link_text = row.lastChild.firstChild.firstChild; + + if (link_text.children.length === 0) { + return null; + } + + const url = new URL(link_text.getElementsByTagName('a')[0].href) + .searchParams + + .get('u'); + + return url; +} const parse_event_name = (row) => { - + const event_name = row.children[1].firstChild.getElementsByTagName('span')[0].innerHTML; + return event_name.trim(); } const load_page = async () => { try { - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ headless: true }); + const page = await browser.newPage(); await page.goto(event_page); await page.waitForSelector(upcoming_event_selector); + await page.evaluate(() => { + window.scrollBy(0, window.innerHeight); + }); + + await page.waitFor(1000); + let get_events = (upcoming_event_id) => { const upcoming_events_element = document.getElementById(upcoming_event_id); @@ -75,11 +132,19 @@ const load_page = async () => { const emptyArrays = item => item; - const parseRowToEvents = table_row => { const date = parse_event_date(table_row); const name = parse_event_name(table_row); - return { date, name }; + const event_id = parse_event_link(table_row); + const ticket_url = parse_ticket_url(table_row); + const location = parse_ticket_location(table_row); + return { + date, + name, + event_id, + ticket_url, + location, + }; }; const parsed_events = events @@ -87,8 +152,6 @@ const load_page = async () => { .filter(emptyArrays) .map(parseRowToEvents) ; - - console.log(parsed_events); } @@ -101,6 +164,5 @@ const load_page = async () => { ( async () => { await load_page(); - process.exit(); } )();