'''
academicearth.scraper
~~~~~~~~~~~~~~~~~~~~~

This module contains the functions that do the website scraping for the
API module. You shouldn't have to use this module directly.
'''
import re
from urllib2 import urlopen
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup as BS


BASE_URL = 'http://www.academicearth.org'


def _url(path):
    '''Returns a full url for the given path'''
    return urljoin(BASE_URL, path)


def get(url):
    '''Performs a GET request for the given url and returns the response body'''
    conn = urlopen(url)
    resp = conn.read()
    conn.close()
    return resp


def _html(url):
    '''Downloads the resource at the given url and parses it via BeautifulSoup'''
    return BS(get(url), convertEntities=BS.HTML_ENTITIES)
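
# Note: this module targets Python 2 (urllib2/urlparse) and BeautifulSoup 3;
# convertEntities=BS.HTML_ENTITIES is the BS3 way to decode HTML entities
# into unicode characters during parsing.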


def make_showall_url(url):
    '''Takes an api url and appends info to the path to force the page to
    return all entries instead of paginating.
    '''
    if not url.endswith('/'):
        url += '/'
    return url + 'page:1/show:500'
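
# For example (illustrative input; pure string manipulation, nothing fetched):
#   make_showall_url('http://www.academicearth.org/subjects/astronomy')
#   -> 'http://www.academicearth.org/subjects/astronomy/page:1/show:500'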


def get_subjects():
    '''Returns a list of subjects for the website. Each subject is a dict with
    keys of 'name' and 'url'.
    '''
    url = _url('subjects')
    html = _html(url)
    # attr_value is None for anchors without an href, so guard before matching
    subjs = html.findAll('a', {'href': lambda attr_value: attr_value
                               and attr_value.startswith('/subjects/')
                               and len(attr_value) > len('/subjects/')})

    # subjs will contain some duplicates so we will key on url
    items = []
    urls = set()
    for subj in subjs:
        url = _url(subj['href'])
        if url not in urls:
            urls.add(url)
            items.append({
                'name': subj.string,
                'url': url,
            })

    # filter out any items that didn't parse correctly
    return [item for item in items if item['name'] and item['url']]
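
# Illustrative usage (performs a live request, so results depend on the site):
#   >>> get_subjects()[0]
#   {'name': u'...', 'url': 'http://www.academicearth.org/subjects/...'}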


def get_subject_metadata(subject_url):
    '''Returns metadata for a subject parsed from the given url'''
    html = _html(make_showall_url(subject_url))
    name = get_subject_name(html)
    courses = get_courses(html)
    lectures = get_lectures(html)
    desc = get_subject_description(html)

    return {
        'name': name,
        'courses': courses,
        'lectures': lectures,
        'description': desc,
    }
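
# Illustrative return shape (values depend on the live page):
#   {'name': u'...',
#    'courses': [{'name': ..., 'url': ..., 'icon': ...}, ...],
#    'lectures': [{'name': ..., 'url': ..., 'icon': ...}, ...],
#    'description': u'...'}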


def get_subject_name(html):
    '''Returns the subject name from a parsed subject page'''
    return html.find('article').h1.text


def get_course_name(html):
    '''Returns the course name from a parsed course page'''
    return html.find('section', {'class': 'pagenav'}).span.text


def get_lecture_name(html):
    '''Returns the lecture name from a parsed lecture page'''
    return html.find('section', {'class': 'pagenav'}).span.text


def get_subject_description(html):
    '''Returns the subject description, joining its parts with newlines'''
    desc_nodes = html.find('article').findAll('span')
    return '\n'.join(node.text.strip() for node in desc_nodes)


def _get_courses_or_lectures(class_type, html):
    '''class_type can be 'course' or 'lecture'.'''
    nodes = html.findAll('div', {'class': class_type})

    items = [{
        'name': node.h3.text,
        'url': _url(node.a['href']),
        'icon': node.img['src'],
        #'university': '',
        #'speaker': '',
    } for node in nodes]

    return items


def get_lectures(html):
    return _get_courses_or_lectures('lecture', html)


def get_courses(html):
    return _get_courses_or_lectures('course', html)


def get_course_metadata(course_url):
    '''Returns metadata for a course parsed from the given url'''
    html = _html(make_showall_url(course_url))
    lectures = get_lectures(html)
    name = get_course_name(html)
    return {
        'lectures': lectures,
        'name': name,
    }


def get_lecture_metadata(lecture_url):
    '''Returns metadata for a lecture parsed from the given url'''
    html = _html(lecture_url)
    name = get_lecture_name(html)
    youtube_id = parse_youtube_id(html)
    return {
        'name': name,
        'youtube_id': youtube_id,
    }


def parse_youtube_id(html):
    '''Returns the YouTube video id embedded in the page, or None'''
    embed = html.find('embed')
    if embed is None:
        # some lecture pages have no embedded player at all
        return None
    yt_ptn = re.compile(r'http://www\.youtube\.com/v/(.+?)\?')
    match = yt_ptn.search(embed['src'])
    if match:
        return match.group(1)
    return None
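
# Illustrative: for an <embed> tag whose src is
# 'http://www.youtube.com/v/abc123?fs=1' (hypothetical markup),
# parse_youtube_id would return 'abc123'.


if __name__ == '__main__':
    # Minimal smoke-test sketch: hits the live site, so output depends on
    # academicearth.org being reachable and unchanged. Not part of the API.
    for subject in get_subjects()[:3]:
        print '%(name)s -> %(url)s' % subject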