Push all old addons

Push of all my old addons that are possibly well broken...
2017-12-03 19:26:03 +00:00
parent 4b3705e726
commit 671b609c17
57 changed files with 1385 additions and 1 deletions
--- a/plugin.video.londonlive/resources/lib/academicearth/scraper.py
+++ b/plugin.video.londonlive/resources/lib/academicearth/scraper.py
@@ -0,0 +1,151 @@
+'''
+    academicearth.scraper
+    ~~~~~~~~~~~~~~~~~~~~~
+
+    This module contains some functions which do the website scraping for the
+    API module. You shouldn't have to use this module directly.
+'''
+import re
+from urllib2 import urlopen
+from urlparse import urljoin
+from BeautifulSoup import BeautifulSoup as BS
+
+
+# Root of the site; every relative path handled here is resolved against it.
+BASE_URL = 'http://www.academicearth.org'
+
+
+def _url(path):
+    '''Resolves the given path against BASE_URL and returns the resulting
+    url.
+    '''
+    full_url = urljoin(BASE_URL, path)
+    return full_url
+
+
+def get(url):
+    '''Performs a GET request for the given url and returns the response
+    body.
+
+    Exceptions raised by urlopen() or while reading are not caught here and
+    propagate to the caller.
+    '''
+    conn = urlopen(url)
+    # Close in a finally clause so the connection is released even when
+    # read() raises part-way through the download.
+    try:
+        return conn.read()
+    finally:
+        conn.close()
+
+
+def _html(url):
+    '''Fetches the resource at the given url with get() and returns it parsed
+    by BeautifulSoup, with HTML entities converted.
+    '''
+    markup = get(url)
+    return BS(markup, convertEntities=BS.HTML_ENTITIES)
+
+
+def make_showall_url(url):
+    '''Returns the given api url with 'page:1/show:500' appended to its path,
+    which asks the site for all entries at once instead of a paginated list.
+
+    A '/' separator is inserted only when the url does not already end with
+    one.
+    '''
+    separator = '' if url.endswith('/') else '/'
+    return ''.join((url, separator, 'page:1/show:500'))
+
+
+def get_subjects():
+    '''Returns a list of subjects for the website. Each subject is a dict with
+    keys of 'name' and 'url'.
+
+    Links are de-duplicated by their absolute url; for each url the first
+    link that has a non-empty name is kept, in page order.
+    '''
+    html = _html(_url('subjects'))
+    prefix = '/subjects/'
+    # Keep links that point below /subjects/, not the bare '/subjects/' link.
+    subjs = html.findAll('a',
+        {'href': lambda attr_value: attr_value.startswith(prefix)
+                                    and len(attr_value) > len(prefix)})
+
+    # subjs will contain some duplicates so we key on url. A link whose
+    # .string is empty or None is skipped *before* its url is recorded:
+    # otherwise a nameless duplicate would mark the url as seen, hide a later
+    # well-formed link to the same subject, and the subject would be lost.
+    items = []
+    seen_urls = set()
+    for subj in subjs:
+        name = subj.string
+        subject_url = _url(subj['href'])
+        if not name or subject_url in seen_urls:
+            continue
+        seen_urls.add(subject_url)
+        items.append({
+            'name': name,
+            'url': subject_url,
+        })
+
+    return items
+
+
+def get_subject_metadata(subject_url):
+    '''Downloads the show-all version of the subject page at the given url
+    and returns a dict with keys 'name', 'courses', 'lectures' and
+    'description'.
+    '''
+    page = _html(make_showall_url(subject_url))
+    return {
+        'name': get_subject_name(page),
+        'courses': get_courses(page),
+        'lectures': get_lectures(page),
+        'description': get_subject_description(page),
+    }
+
+
+def get_subject_name(html):
+    '''Returns the text of the <h1> inside the <article> found in the given
+    parsed page.
+    '''
+    article = html.find('article')
+    return article.h1.text
+
+
+def get_course_name(html):
+    '''Returns the course name, taken from the text of the <span> inside the
+    <section class="pagenav"> of the given parsed page.
+    '''
+    pagenav = html.find('section', {'class': 'pagenav'})
+    return pagenav.span.text
+
+
+def get_lecture_name(html):
+    '''Returns the lecture name, taken from the text of the <span> inside the
+    <section class="pagenav"> of the given parsed page.
+    '''
+    pagenav = html.find('section', {'class': 'pagenav'})
+    return pagenav.span.text
+
+
+def get_subject_description(html):
+    '''Returns the subject description: the stripped text of every <span>
+    inside the page's <article>, joined with newlines.
+    '''
+    article = html.find('article')
+    lines = [span.text.strip() for span in article.findAll('span')]
+    return '\n'.join(lines)
+    
+
+def _get_courses_or_lectures(class_type, html):
+    '''Returns a list of dicts, one per <div> of the given css class in the
+    parsed page, each with keys 'name', 'url' and 'icon'.
+
+    class_type can be 'course' or 'lecture'.
+
+    'name' is the text of the div's <h3>, 'url' is the href of its <a>
+    resolved with _url(), and 'icon' is the src of its <img> as found in the
+    page. A div missing any of these pieces is skipped, so one malformed
+    entry does not abort the whole listing.
+    '''
+    items = []
+    for node in html.findAll('div', {'class': class_type}):
+        heading, link, image = node.h3, node.a, node.img
+        if heading is None or link is None or image is None:
+            continue
+
+        # .get() returns None for a missing attribute instead of raising.
+        href = link.get('href')
+        icon = image.get('src')
+        if href is None or icon is None:
+            continue
+
+        items.append({
+            'name': heading.text,
+            'url': _url(href),
+            'icon': icon,
+        })
+
+    return items
+
+
+def get_lectures(html):
+    '''Returns the entries for every <div class="lecture"> in the given
+    parsed page, as produced by _get_courses_or_lectures().
+    '''
+    return _get_courses_or_lectures(class_type='lecture', html=html)
+
+
+def get_courses(html):
+    '''Returns the entries for every <div class="course"> in the given
+    parsed page, as produced by _get_courses_or_lectures().
+    '''
+    return _get_courses_or_lectures(class_type='course', html=html)
+
+
+def get_course_metadata(course_url):
+    '''Downloads the show-all version of the course page at the given url and
+    returns a dict with keys 'lectures' and 'name'.
+    '''
+    page = _html(make_showall_url(course_url))
+    metadata = {'lectures': get_lectures(page)}
+    metadata['name'] = get_course_name(page)
+    return metadata
+
+
+def get_lecture_metadata(lecture_url):
+    '''Downloads the lecture page at the given url and returns a dict with
+    keys 'name' and 'youtube_id'.
+    '''
+    page = _html(lecture_url)
+    return {
+        'name': get_lecture_name(page),
+        'youtube_id': parse_youtube_id(page),
+    }
+    
+
+
+def parse_youtube_id(html):
+    '''Returns the YouTube video id referenced by the <embed> tag of the
+    given parsed page, or None when it cannot be determined.
+
+    None is returned when the page has no <embed>, when the tag has no src,
+    or when the src is not a www.youtube.com/v/<id> url. The id ends at the
+    first '?', '&' or '#', so urls without a query string are handled too.
+    Both http and https urls are accepted.
+    '''
+    embed = html.find('embed')
+    if embed is None:
+        return None
+
+    # .get() returns None for a missing attribute instead of raising.
+    src = embed.get('src')
+    if not src:
+        return None
+
+    match = re.search(r'https?://www\.youtube\.com/v/([^?&#]+)', src)
+    return match.group(1) if match else None