Push all old addons
Push of all my old addons that are possibly well broken...
This commit is contained in:
151
plugin.video.londonlive/resources/lib/academicearth/scraper.py
Normal file
151
plugin.video.londonlive/resources/lib/academicearth/scraper.py
Normal file
@@ -0,0 +1,151 @@
|
||||
'''
|
||||
academicearth.scraper
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains some functions which do the website scraping for the
|
||||
API module. You shouldn't have to use this module directly.
|
||||
'''
|
||||
import re
|
||||
from urllib2 import urlopen
|
||||
from urlparse import urljoin
|
||||
from BeautifulSoup import BeautifulSoup as BS
|
||||
|
||||
|
||||
BASE_URL = 'http://www.academicearth.org'
|
||||
def _url(path):
|
||||
'''Returns a full url for the given path'''
|
||||
return urljoin(BASE_URL, path)
|
||||
|
||||
|
||||
def get(url):
|
||||
'''Performs a GET request for the given url and returns the response'''
|
||||
conn = urlopen(url)
|
||||
resp = conn.read()
|
||||
conn.close()
|
||||
return resp
|
||||
|
||||
|
||||
def _html(url):
|
||||
'''Downloads the resource at the given url and parses via BeautifulSoup'''
|
||||
return BS(get(url), convertEntities=BS.HTML_ENTITIES)
|
||||
|
||||
|
||||
def make_showall_url(url):
|
||||
'''Takes an api url and appends info to the path to force the page to
|
||||
return all entries instead of paginating.
|
||||
'''
|
||||
if not url.endswith('/'):
|
||||
url += '/'
|
||||
return url + 'page:1/show:500'
|
||||
|
||||
|
||||
def get_subjects():
|
||||
'''Returns a list of subjects for the website. Each subject is a dict with
|
||||
keys of 'name' and 'url'.
|
||||
'''
|
||||
url = _url('subjects')
|
||||
html = _html(url)
|
||||
subjs = html.findAll('a',
|
||||
{'href': lambda attr_value: attr_value.startswith('/subjects/')
|
||||
and len(attr_value) > len('/subjects/')})
|
||||
|
||||
# subjs will contain some duplicates so we will key on url
|
||||
items = []
|
||||
urls = set()
|
||||
for subj in subjs:
|
||||
url = _url(subj['href'])
|
||||
if url not in urls:
|
||||
urls.add(url)
|
||||
items.append({
|
||||
'name': subj.string,
|
||||
'url': url,
|
||||
})
|
||||
|
||||
# filter out any items that didn't parse correctly
|
||||
return [item for item in items if item['name'] and item['url']]
|
||||
|
||||
|
||||
def get_subject_metadata(subject_url):
|
||||
'''Returns metadata for a subject parsed from the given url'''
|
||||
html = _html(make_showall_url(subject_url))
|
||||
name = get_subject_name(html)
|
||||
courses = get_courses(html)
|
||||
lectures = get_lectures(html)
|
||||
desc = get_subject_description(html)
|
||||
|
||||
return {
|
||||
'name': name,
|
||||
'courses': courses,
|
||||
'lectures': lectures,
|
||||
'description': desc,
|
||||
}
|
||||
|
||||
|
||||
def get_subject_name(html):
|
||||
return html.find('article').h1.text
|
||||
|
||||
|
||||
def get_course_name(html):
|
||||
return html.find('section', {'class': 'pagenav'}).span.text
|
||||
|
||||
|
||||
def get_lecture_name(html):
|
||||
return html.find('section', {'class': 'pagenav'}).span.text
|
||||
|
||||
|
||||
def get_subject_description(html):
|
||||
desc_nodes = html.find('article').findAll('span')
|
||||
return '\n'.join(node.text.strip() for node in desc_nodes)
|
||||
|
||||
|
||||
def _get_courses_or_lectures(class_type, html):
|
||||
'''class_type can be 'course' or 'lecture'.'''
|
||||
nodes = html.findAll('div', {'class': class_type})
|
||||
|
||||
items = [{
|
||||
'name': node.h3.text,
|
||||
'url': _url(node.a['href']),
|
||||
'icon': node.img['src'],
|
||||
#'university': '',
|
||||
#'speaker': '',
|
||||
} for node in nodes]
|
||||
|
||||
return items
|
||||
|
||||
|
||||
def get_lectures(html):
|
||||
return _get_courses_or_lectures('lecture', html)
|
||||
|
||||
|
||||
def get_courses(html):
|
||||
return _get_courses_or_lectures('course', html)
|
||||
|
||||
|
||||
def get_course_metadata(course_url):
|
||||
html = _html(make_showall_url(course_url))
|
||||
lectures = get_lectures(html)
|
||||
name = get_course_name(html)
|
||||
return {
|
||||
'lectures': lectures,
|
||||
'name': name,
|
||||
}
|
||||
|
||||
|
||||
def get_lecture_metadata(lecture_url):
|
||||
html = _html(lecture_url)
|
||||
name = get_lecture_name(html)
|
||||
youtube_id = parse_youtube_id(html)
|
||||
return {
|
||||
'name': name,
|
||||
'youtube_id': youtube_id
|
||||
}
|
||||
|
||||
|
||||
|
||||
def parse_youtube_id(html):
|
||||
embed = html.find('embed')
|
||||
yt_ptn = re.compile(r'http://www.youtube.com/v/(.+?)\?')
|
||||
match = yt_ptn.search(embed['src'])
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
Reference in New Issue
Block a user