Browse Source

working scrape of chrome windows

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
039c774f3c
  1. 51
      event-table.html
  2. 40
      index.html
  3. 94
      scrape.js

51
event-table.html

@ -1,51 +0,0 @@
<table class="_4dmd _4eok uiGrid _51mz" cols="4" cellspacing="0" cellpadding="0">
<tbody>
<tr class="_51mx">
<td class="_5px7 _51m-">
<span class="_5x8v _5a5j _5a5i">
<span class="_5a4-">
JUL</span>
<span class="_5a4z">
31</span>
</span>
</td>
<td class="_4dmi _51m-">
<div class="_4dmj">
<div class="_4dmk">
<a data-hovercard="/ajax/hovercard/event.php?id=1349220518586234" href="/events/1349220518586234/?acontext=%7B%22ref%22%3A51%2C%22source%22%3A5%2C%22action_history%22%3A[%7B%22surface%22%3A%22page%22%2C%22mechanism%22%3A%22main_list%22%2C%22extra_data%22%3A%22%5C%22[]%5C%22%22%7D]%2C%22has_source%22%3Atrue%7D">
<span class=" _50f7">
Bakgårdsfestivalens TørnQuiz </span>
</a>
</div>
<div class="_4dml fsm fwn fcg">
<span class="">
Fri 8:00 PM UTC+02</span>
<span aria-hidden="true">
· </span>
535 guests</div>
</div>
</td>
<td class="_5pxd _51m-">
<div class="_4dmn">
<div class="_30n-">
<a data-hovercard="/ajax/hovercard/hovercard.php?id=515909715151075" href="https://www.facebook.com/sgarden.no/">
Strømsøe-gården</a>
</div>
<div class="_30n_">
Levanger</div>
</div>
</td>
<td class="_4dmt _51mw _51m-">
<div class="_4dmu">
<div class="_2ib5">
<div class="_2ib4">
<a class="_4jy0 _4jy3 _517h _51sy _42ft" target="_blank" href="https://l.facebook.com/l.php?u=https%3A%2F%2Fkultar.no%2Fbilletter%2Fbakgardsfestivalen-tornquiz%2F&amp;h=AT2J45hkqUacEeIfbN4QiQiO4ayob0q77rCHxzA-pZzLX4nq4bTJ6d5j5zo0ZaE-Uljp7r5l2pYKBMAmaIYuTaZ3I6duSGnE97cgFXNWppV8pExLWlfVdifiMQGC9gjckFt8lyX1dPqwUw" rel="nofollow noopener" data-lynx-mode="hover">
Get Tickets</a>
</div>
</div>
</div>
</td>
</tr>
</tbody>
</table>

40
index.html

File diff suppressed because one or more lines are too long

94
scrape.js

@ -1,11 +1,10 @@
const puppeteer = require('puppeteer');
const { JSDOM } = require('jsdom');
// Scrape configuration: the page to load and how to locate the event rows in it.
// NOTE(review): event_page, upcoming_event_id and upcoming_event_selector are
// each declared twice in this span. It reads like the removed and the added
// lines of a diff shown together; only one set can exist in a runnable file --
// confirm which one is current.
// const event_page = 'https://www.facebook.com/KulturVarsel/events';
const event_page = 'file:///home/zalox/src/kultar-events/index.html';
const upcoming_event_id = 'upcoming_events_card';
const upcoming_event_selector = `#${upcoming_event_id}`;
// const event_page = 'https://www.facebook.com/pg/bandmaldito/events';
// Local saved copy of a page, loaded through a file:// URL.
const event_page = 'file:///home/zalox/src/kultar-events/index.chrome.html';
const upcoming_event_id = 'past_events_card';
// NOTE(review): this selector starts with '#pastup_events_card', while the id
// on the line above is 'past_events_card' -- confirm the id is spelled as intended.
const upcoming_event_selector = `#pastup_events_card > div > div:nth-child(2) > table > tbody > tr`;
const month_name_to_number = (month_name) => {
switch (month_name.toUpperCase()) {
@ -24,29 +23,87 @@ const month_name_to_number = (month_name) => {
}
}
/**
 * Extracts the clock time and the UTC offset from the tail of an event time
 * label.
 *
 * The label has to end in `<h>:<mm> <zone><sign><dd>`, optionally with an
 * AM/PM marker after the minutes, e.g. `Fri 20:00 UTC+02` or
 * `Fri 8:00 PM UTC+02`. Whatever precedes the time (weekday, date, ...) is
 * ignored. 12-hour times are converted to 24-hour so the result can be handed
 * to the `Date` constructor directly.
 *
 * @param {string} event_time_text - Label text to parse.
 * @returns {{tz: string, min: string, hour: string} | null} The zone suffix,
 *   minutes and 24-hour hour as strings, or `null` when the text is not a
 *   string or does not end in a recognisable time.
 */
const parse_event_time = (event_time_text) => {
  if (typeof event_time_text !== 'string') {
    return null;
  }
  // Anchored at the end of the label: hour, minutes, optional AM/PM, then a
  // three-letter zone name followed by a signed two-digit offset.
  const time_pattern = /(\d{1,2}):(\d{2})(?:\s*([AP]M))?\s+([A-Za-z]{3}[+-]\d{2})$/i;
  const match = time_pattern.exec(event_time_text.trim());
  if (match === null) {
    return null;
  }
  const [, hour_text, minutes, meridiem, timezone] = match;
  let hour = hour_text;
  if (meridiem !== undefined) {
    // 12 AM is hour 0 and 12 PM is hour 12; every other PM hour gains 12.
    const hour_on_12h_clock = Number(hour_text) % 12;
    const is_pm = meridiem.toUpperCase() === 'PM';
    hour = String(is_pm ? hour_on_12h_clock + 12 : hour_on_12h_clock);
  }
  return {
    tz: timezone,
    min: minutes,
    hour,
  };
};
/**
 * Reads the host and location markup from the third cell of an event row.
 *
 * Inside the first child of that cell, `host` is the `innerHTML` of the first
 * child's first child and `location` is the `innerHTML` of the last child.
 *
 * @param {Element} row - Event table row with at least three child cells.
 * @returns {{host: string, location: string}} The two `innerHTML` values, unmodified.
 */
const parse_ticket_location = (row) => {
  const venue_wrapper = row.children[2].firstChild;
  const { innerHTML: host } = venue_wrapper.firstChild.firstChild;
  const { innerHTML: location } = venue_wrapper.lastChild;
  return { host, location };
};
// Reads the event date from the first cell of a table row.
// NOTE(review): as shown here the function returns the { day, month } object
// below, which leaves every statement after that return unreachable. This looks
// like the removed and the added lines of a diff displayed together -- confirm
// which return is the current one before relying on the result type.
const parse_event_date = (row) => {
// First child of the row's first child; its first and last children are read
// as the month name and the day number.
const date_column = row.firstChild.firstChild;
const month_text = new String(date_column.firstChild.innerHTML);
// Subtracts 1 from the value month_name_to_number returns; the month argument
// of the Date constructor used further down is 0-based.
const month = new Number(month_name_to_number(month_text)) - 1;
const day = new Number(date_column.lastChild.innerHTML);
return {
day: Number(day),
month: Number(month_name_to_number(month_text)),
};
// Time label: innerHTML of the second <span> under the last child of the
// row's second cell.
const event_time_text = row.children[1].lastChild.getElementsByTagName('span')[1].innerHTML;
const event_time = parse_event_time(event_time_text);
if (event_time === null) {
// No parsable time: build a date-only value. The year is hard-coded to 2020.
return new Date(2020, month, day);
}
return new Date(2020, month, day, event_time.hour, event_time.min);
}
/**
 * Extracts the event id from the first link in the row's second cell.
 *
 * The id is taken from the `/events/<digits>` segment of the link's `href`,
 * so a relative href (`/events/123/?...`) and an absolute one
 * (`https://host/events/123/`) give the same result. Splitting on `/` and
 * taking a fixed index only works for the relative form.
 *
 * @param {Element} row - Event table row; the first child of its second cell
 *   must contain at least one `<a>`.
 * @returns {string | null} The digits of the event id, or `null` when the
 *   href has no `/events/<digits>` segment.
 */
const parse_event_link = (row) => {
  const link_text = row.children[1].firstChild.getElementsByTagName('a')[0].href;
  const id_match = /\/events\/(\d+)/.exec(link_text);
  return id_match === null ? null : id_match[1];
};
const parse_event_name = (row) => {
/**
 * Returns the ticket target stored in the `u` query parameter of the row's
 * ticket link.
 *
 * The link is looked up below `row.lastChild.firstChild.firstChild`. When that
 * element has no child elements the row is treated as having no ticket link.
 *
 * @param {Element} row - Event table row.
 * @returns {string | null} The decoded value of the `u` parameter, or `null`
 *   when the wrapper is empty or the link carries no `u` parameter.
 */
const parse_ticket_url = (row) => {
  const ticket_wrapper = row.lastChild.firstChild.firstChild;
  const has_child_elements = ticket_wrapper.children.length !== 0;
  if (!has_child_elements) {
    return null;
  }
  const anchor = ticket_wrapper.getElementsByTagName('a')[0];
  const redirect_link = new URL(anchor.href);
  return redirect_link.searchParams.get('u');
};
/**
 * Returns the event title of a table row.
 *
 * The title is the `innerHTML` of the first `<span>` found under the first
 * child of the row's second cell, with surrounding whitespace removed.
 *
 * @param {Element} row - Event table row.
 * @returns {string} The trimmed title markup.
 */
const parse_event_name = (row) => {
  const title_span = row.children[1].firstChild.getElementsByTagName('span')[0];
  return title_span.innerHTML.trim();
};
const load_page = async () => {
try {
const browser = await puppeteer.launch();
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(event_page);
await page.waitForSelector(upcoming_event_selector);
await page.evaluate(() => {
window.scrollBy(0, window.innerHeight);
});
await page.waitFor(1000);
let get_events = (upcoming_event_id) => {
const upcoming_events_element = document.getElementById(upcoming_event_id);
@ -75,11 +132,19 @@ const load_page = async () => {
const emptyArrays = item => item;
const parseRowToEvents = table_row => {
const date = parse_event_date(table_row);
const name = parse_event_name(table_row);
return { date, name };
const event_id = parse_event_link(table_row);
const ticket_url = parse_ticket_url(table_row);
const location = parse_ticket_location(table_row);
return {
date,
name,
event_id,
ticket_url,
location,
};
};
const parsed_events = events
@ -88,8 +153,6 @@ const load_page = async () => {
.map(parseRowToEvents)
;
console.log(parsed_events);
}
@ -101,6 +164,5 @@ const load_page = async () => {
// Entry point: run the scrape once, then end the process explicitly.
(async () => {
  try {
    await load_page();
  } catch (error) {
    // Report the failure and signal it through the exit status instead of
    // leaving an unhandled promise rejection behind.
    console.error(error);
    process.exit(1);
  }
  process.exit();
})();

Loading…
Cancel
Save