'''
academicearth.scraper
~~~~~~~~~~~~~~~~~~~~~

This module contains the functions that do the website scraping for the
API module. You shouldn't have to use this module directly.
'''
import re
from urllib2 import urlopen
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup as BS


BASE_URL = 'http://www.academicearth.org'


def _url(path):
    '''Returns a full url for the given path'''
    return urljoin(BASE_URL, path)


def get(url):
    '''Performs a GET request for the given url and returns the response body'''
    conn = urlopen(url)
    resp = conn.read()
    conn.close()
    return resp


def _html(url):
    '''Downloads the resource at the given url and parses it via BeautifulSoup'''
    return BS(get(url), convertEntities=BS.HTML_ENTITIES)
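
# Note: this module targets Python 2 (urllib2/urlparse) and BeautifulSoup 3;
# convertEntities=BS.HTML_ENTITIES is the BS3 way to decode HTML entities
# into unicode characters during parsing.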


def make_showall_url(url):
    '''Takes an api url and appends info to the path to force the page to
    return all entries instead of paginating.
    '''
    if not url.endswith('/'):
        url += '/'
    return url + 'page:1/show:500'
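
# For example (illustrative input; pure string manipulation, nothing fetched):
#   make_showall_url('http://www.academicearth.org/subjects/astronomy')
#   -> 'http://www.academicearth.org/subjects/astronomy/page:1/show:500'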


def get_subjects():
    '''Returns a list of subjects for the website. Each subject is a dict with
    keys of 'name' and 'url'.
    '''
    url = _url('subjects')
    html = _html(url)
    # attr_value is None for anchors without an href, so guard before matching
    subjs = html.findAll('a', {'href': lambda attr_value: attr_value
                               and attr_value.startswith('/subjects/')
                               and len(attr_value) > len('/subjects/')})

    # subjs will contain some duplicates so we will key on url
    items = []
    urls = set()
    for subj in subjs:
        url = _url(subj['href'])
        if url not in urls:
            urls.add(url)
            items.append({
                'name': subj.string,
                'url': url,
            })

    # filter out any items that didn't parse correctly
    return [item for item in items if item['name'] and item['url']]
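
# Illustrative usage (performs a live request, so results depend on the site):
#   >>> get_subjects()[0]
#   {'name': u'...', 'url': 'http://www.academicearth.org/subjects/...'}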


def get_subject_metadata(subject_url):
    '''Returns metadata for a subject parsed from the given url'''
    html = _html(make_showall_url(subject_url))
    name = get_subject_name(html)
    courses = get_courses(html)
    lectures = get_lectures(html)
    desc = get_subject_description(html)

    return {
        'name': name,
        'courses': courses,
        'lectures': lectures,
        'description': desc,
    }
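
# Illustrative return shape (values depend on the live page):
#   {'name': u'...',
#    'courses': [{'name': ..., 'url': ..., 'icon': ...}, ...],
#    'lectures': [{'name': ..., 'url': ..., 'icon': ...}, ...],
#    'description': u'...'}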


def get_subject_name(html):
    '''Returns the subject name from a parsed subject page'''
    return html.find('article').h1.text


def get_course_name(html):
    '''Returns the course name from a parsed course page'''
    return html.find('section', {'class': 'pagenav'}).span.text


def get_lecture_name(html):
    '''Returns the lecture name from a parsed lecture page'''
    return html.find('section', {'class': 'pagenav'}).span.text


def get_subject_description(html):
    '''Returns the subject description, joining its parts with newlines'''
    desc_nodes = html.find('article').findAll('span')
    return '\n'.join(node.text.strip() for node in desc_nodes)


def _get_courses_or_lectures(class_type, html):
    '''class_type can be 'course' or 'lecture'.'''
    nodes = html.findAll('div', {'class': class_type})

    items = [{
        'name': node.h3.text,
        'url': _url(node.a['href']),
        'icon': node.img['src'],
        #'university': '',
        #'speaker': '',
    } for node in nodes]

    return items


def get_lectures(html):
    return _get_courses_or_lectures('lecture', html)


def get_courses(html):
    return _get_courses_or_lectures('course', html)


def get_course_metadata(course_url):
    '''Returns metadata for a course parsed from the given url'''
    html = _html(make_showall_url(course_url))
    lectures = get_lectures(html)
    name = get_course_name(html)
    return {
        'lectures': lectures,
        'name': name,
    }


def get_lecture_metadata(lecture_url):
    '''Returns metadata for a lecture parsed from the given url'''
    html = _html(lecture_url)
    name = get_lecture_name(html)
    youtube_id = parse_youtube_id(html)
    return {
        'name': name,
        'youtube_id': youtube_id,
    }


def parse_youtube_id(html):
    '''Returns the YouTube video id embedded in the page, or None'''
    embed = html.find('embed')
    if embed is None:
        # some lecture pages have no embedded player at all
        return None
    yt_ptn = re.compile(r'http://www\.youtube\.com/v/(.+?)\?')
    match = yt_ptn.search(embed['src'])
    if match:
        return match.group(1)
    return None
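
# Illustrative: for an <embed> tag whose src is
# 'http://www.youtube.com/v/abc123?fs=1' (hypothetical markup),
# parse_youtube_id would return 'abc123'.


if __name__ == '__main__':
    # Minimal smoke-test sketch: hits the live site, so output depends on
    # academicearth.org being reachable and unchanged. Not part of the API.
    for subject in get_subjects()[:3]:
        print '%(name)s -> %(url)s' % subject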