Diffstat (limited to 'dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe')
-rw-r--r--   dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe   368
1 files changed, 0 insertions, 368 deletions
diff --git a/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe b/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe
deleted file mode 100644
index db3c40b..0000000
--- a/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe
+++ /dev/null
@@ -1,368 +0,0 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
-# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import datetime
-import json
-import re
-
-import mechanize
-
-from calibre import strftime
-from calibre.ebooks.BeautifulSoup import Tag
-from calibre.utils.date import strptime
-from calibre.web.feeds.news import BasicNewsRecipe
-from polyglot.urllib import urlencode
-
-is_web_edition = True
-use_wayback_machine = False
-
-# This is an Apollo persisted query hash which you can get
-# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
-# or by https://www.nytimes.com/section/world
-persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
-
-# The sections to download when downloading the web edition, comment out
-# the section you are not interested in
-web_sections = [
-    ('World', 'world'),
-    ('U.S.', 'us'),
-    ('Politics', 'politics'),
-    # ('New York', 'nyregion'),
-    ('Business', 'business'),
-    ('Technology', 'technology'),
-    ('Sports', 'sports'),
-    ('Science', 'science'),
-    ('Health', 'health'),
-    ('Opinion', 'opinion'),
-    # ('Arts', 'arts'),
-    ('Books', 'books'),
-    # ('Movies', 'movies'),
-    # ('Music', 'arts/music'),
-    # ('Television', 'arts/television'),
-    # ('Style', 'style'),
-    # ('Dining & Wine', 'food'),
-    # ('Fashion & Style', 'fashion'),
-    # ('Home & Garden', 'garden'),
-    # ('Travel', 'travel'),
-    ('Education', 'education'),
-    # ('Multimedia', 'multimedia'),
-    ('Obituaries', 'obituaries'),
-    # ('Sunday Magazine', 'magazine')
-]
-# web_sections = [ ('Business', 'business'), ]
-url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
-
-
-def date_from_url(url):
-    m = url_date_pat.search(url)
-    if m is not None:
-        return datetime.date(*map(int, m.groups()))
-
-
-def format_date(d):
-    try:
-        return strftime(' [%a, %d %b %Y]', d)
-    except Exception:
-        return strftime(' [%Y/%m/%d]', d)
-
-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
-
-
-def new_tag(soup, name, attrs=()):
-    impl = getattr(soup, 'new_tag', None)
-    if impl is not None:
-        return impl(name, attrs=dict(attrs))
-    return Tag(soup, name, attrs=attrs or None)
-
-
-class NewYorkTimes(BasicNewsRecipe):
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = (
-            'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
-            'Use advanced menu to make changes to fetch Todays Paper'
-        )
-    else:
-        title = 'The New York Times'
-        description = (
-            'New York Times. Todays Paper '
-            'Use advanced menu to make changes to fetch Web Edition'
-        )
-    encoding = 'utf-8'
-    __author__ = 'Kovid Goyal'
-    language = 'en_US'
-    ignore_duplicate_articles = {'title', 'url'}
-    no_stylesheets = True
-    oldest_web_edition_article = 1  # days
-
-    extra_css = '''
-        .byl, .time { font-size:small; color:#202020; }
-        .cap { font-size:small; text-align:center; }
-        .cred { font-style:italic; font-size:small; }
-        em, blockquote { color: #202020; }
-        .sc { font-variant: small-caps; }
-        .lbl { font-size:small; color:#404040; }
-        img { display:block; margin:0 auto; }
-    '''
-
-    @property
-    def nyt_parser(self):
-        ans = getattr(self, '_nyt_parser', None)
-        if ans is None:
-            from calibre.live import load_module
-            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
-        return ans
-
-    def get_nyt_page(self, url, skip_wayback=False):
-        if use_wayback_machine and not skip_wayback:
-            from calibre import browser
-            return self.nyt_parser.download_url(url, browser())
-        return self.index_to_soup(url, raw=True)
-
-    def preprocess_raw_html(self, raw_html, url):
-        cleaned = self.nyt_parser.clean_js_json(raw_html)
-        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
-
-    articles_are_obfuscated = use_wayback_machine
-
-    if use_wayback_machine:
-        def get_obfuscated_article(self, url):
-            from calibre.ptempfile import PersistentTemporaryFile
-            with PersistentTemporaryFile() as tf:
-                tf.write(self.get_nyt_page(url))
-            return tf.name
-
-    recipe_specific_options = {
-        'web': {
-            'short': 'Type in yes, if you want ' + ('Todays Paper' if is_web_edition else 'Web Edition'),
-            'default': 'Web Edition' if is_web_edition else 'Todays Paper',
-        },
-        'days': {
-            'short': 'Oldest article to download from this news source. In days ',
-            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
-            'default': str(oldest_web_edition_article)
-        },
-        'date': {
-            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
-            'long': 'For example, 2024/07/16'
-        },
-        'res': {
-            'short': (
-                'For hi-res images, select a resolution from the following\noptions: '
-                'popup, jumbo, mobileMasterAt3x, superJumbo'
-            ),
-            'long': (
-                'This is useful for non e-ink devices, and for a lower file size\nthan '
-                'the default, use mediumThreeByTwo440, mediumThreeByTwo225, articleInline.'
-            ),
-        },
-        'comp': {
-            'short': 'Compress News Images?',
-            'long': 'enter yes',
-            'default': 'no'
-        }
-    }
-
-    def __init__(self, *args, **kwargs):
-        BasicNewsRecipe.__init__(self, *args, **kwargs)
-        c = self.recipe_specific_options.get('comp')
-        d = self.recipe_specific_options.get('days')
-        w = self.recipe_specific_options.get('web')
-        self.is_web_edition = is_web_edition
-        if w and isinstance(w, str):
-            if w == 'yes':
-                self.is_web_edition = not is_web_edition
-        if d and isinstance(d, str):
-            self.oldest_web_edition_article = float(d)
-        if c and isinstance(c, str):
-            if c.lower() == 'yes':
-                self.compress_news_images = True
-
-    def read_todays_paper(self):
-        pdate = self.recipe_specific_options.get('date')
-        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
-        if pdate and isinstance(pdate, str):
-            return pdate, self.index_to_soup(templ.format(pdate))
-        # Cant figure out how to get the date so just try todays and yesterdays dates
-        date = datetime.date.today()
-        pdate = date.strftime('%Y/%m/%d')
-        try:
-            soup = self.index_to_soup(templ.format(pdate))
-        except Exception as e:
-            if getattr(e, 'code', None) == 404:
-                date -= datetime.timedelta(days=1)
-                pdate = date.strftime('%Y/%m/%d')
-                soup = self.index_to_soup(templ.format(pdate))
-            else:
-                raise
-        self.log("Using today's paper from:", pdate)
-        return pdate, soup
-
-    def read_nyt_metadata(self):
-        pdate, soup = self.read_todays_paper()
-        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
-        self.timefmt = strftime(' [%d %b, %Y]', date)
-        self.nytimes_publication_date = pdate
-        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-        script = type(u'')(script)
-        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        clean_json = self.nyt_parser.clean_js_json(raw_json)
-        self.nytimes_graphql_config = json.loads(clean_json)['config']
-        return soup
-
-    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
-        query = {
-            'operationName': operationName,
-            'variables': json.dumps({
-                'id': qid,
-                'first': 10,
-                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
-                'isFetchMore':False,
-                'isTranslatable':False,
-                'isEspanol':False,
-                'highlightsListUri':'nyt://per/personalized-list/__null__',
-                'highlightsListFirst':0,
-                'hasHighlightsList':False
-            }, separators=',:'),
-            'extensions': json.dumps({
-                'persistedQuery': {
-                    'version':1,
-                    'sha256Hash': persistedQuery,
-                },
-            }, separators=',:')
-        }
-        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
-        br = self.browser
-        # br.set_debug_http(True)
-        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
-        headers['Accept'] = 'application/json'
-        req = mechanize.Request(url, headers=headers)
-        raw = br.open(req).read()
-        # open('/t/raw.json', 'wb').write(raw)
-        return json.loads(raw)
-
-    def parse_todays_page(self):
-        self.read_nyt_metadata()
-        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
-        data = self.nyt_graphql_query(query_id)
-        return parse_todays_page(data, self.log)
-
-    def parse_web_sections(self):
-        self.read_nyt_metadata()
-        feeds = []
-        for section_title, slug in web_sections:
-            query_id = '/section/' + slug
-            try:
-                data = self.nyt_graphql_query(query_id)
-                self.log('Section:', section_title)
-                articles = parse_web_section(data, log=self.log, title=section_title)
-            except Exception as e:
-                self.log('Failed to parse section:', section_title, 'with error:', e)
-                articles = []
-            if articles:
-                feeds.append((section_title, articles))
-            else:
-                # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
-                self.log(' No articles found in section:', section_title)
-            if self.test and len(feeds) >= self.test[0]:
-                break
-        return feeds
-
-    def parse_index(self):
-        # return [('All articles', [
-        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
-        # ])]
-        if self.is_web_edition:
-            return self.parse_web_sections()
-        return self.parse_todays_page()
-
-    def get_browser(self, *args, **kwargs):
-        kwargs['user_agent'] = 'User-Agent: Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
-        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
-        return br
-
-    def preprocess_html(self, soup):
-        w = self.recipe_specific_options.get('res')
-        if w and isinstance(w, str):
-            res = '-' + w
-            for img in soup.findAll('img', attrs={'src':True}):
-                if '-article' in img['src']:
-                    ext = img['src'].split('?')[0].split('.')[-1]
-                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
-        for c in soup.findAll('div', attrs={'class':'cap'}):
-            for p in c.findAll(['p', 'div']):
-                p.name = 'span'
-        return soup
-
-    def get_article_url(self, article):
-        url = BasicNewsRecipe.get_article_url(self, article)
-        if not re.search(r'/video/|/athletic/|/card/', url):
-            return url
-        self.log('\tSkipping ', url)
-
-
-def asset_to_article(asset):
-    title = asset['headline']['default']
-    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
-
-
-def parse_todays_page(data, log=print):
-    containers = data['data']['legacyCollection']['groupings'][0]['containers']
-    feeds = []
-    for cont in containers:
-        if cont['__typename'] != 'LegacyCollectionContainer':
-            continue
-        section_name = cont['label'].strip()
-        if not section_name:
-            continue
-        log(section_name)
-        articles = []
-        for rel in cont['relations']:
-            if rel.get('__typename') == 'LegacyCollectionRelation':
-                asset = rel['asset']
-                if asset['__typename'] == 'Article':
-                    articles.append(asset_to_article(asset))
-                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
-        if articles:
-            feeds.append((section_name, articles))
-    return feeds
-
-
-def parse_web_section(data, log=print, title=''):
-    articles = []
-    try:
-        containers = data['data']['legacyCollection']['collectionsPage']
-        if containers.get('embeddedCollections'):
-            containers = containers['embeddedCollections']
-        else:
-            containers = [containers]
-    except Exception as e:
-        log('Failed to parse web section', title, 'with error:', e)
-        return articles
-    for cont in containers:
-        for s in cont['stream']['edges']:
-            asset = s['node']
-            if asset['__typename'] == 'Article':
-                articles.append(asset_to_article(asset))
-                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
-    return articles
-
-
-if __name__ == '__main__':
-    import sys
-    data = json.loads(open(sys.argv[-1], 'rb').read())
-    if is_web_edition:
-        parse_web_section(data)
-    else:
-        parse_todays_page(data)
-
-
-calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
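
For context, the core network call in the deleted recipe is the Apollo persisted-query request assembled in nyt_graphql_query(). Below is a minimal standalone sketch of that request using only the Python standard library. The endpoint URL is an assumption for illustration: the recipe itself reads gqlUrlClient and gqlRequestHeaders out of the window.__preloadedData config it scrapes from the page, and without those headers NYT will most likely reject the request. The hash is the one from the deleted file and may since have been rotated.

    import json
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    # Hash copied from the deleted recipe; NYT rotates these periodically.
    PERSISTED_QUERY = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
    # Assumed endpoint: the recipe reads gqlUrlClient from the scraped page config.
    GQL_URL = 'https://samizdat-graphql.nytimes.com/graphql/v2'

    def collections_query(qid, headers):
        # qid is a collection id such as '/section/world' or
        # '/issue/todayspaper/2024/07/16/todays-new-york-times'.
        params = {
            'operationName': 'CollectionsQuery',
            'variables': json.dumps({'id': qid, 'first': 10}, separators=(',', ':')),
            'extensions': json.dumps(
                {'persistedQuery': {'version': 1, 'sha256Hash': PERSISTED_QUERY}},
                separators=(',', ':')),
        }
        # The query is sent as GET parameters, exactly as the recipe builds it.
        req = Request(GQL_URL + '?' + urlencode(params),
                      headers=dict(headers, Accept='application/json'))
        with urlopen(req) as resp:
            return json.loads(resp.read())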
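
If the file is ever restored, calibre's documented recipe-development workflow exercises it straight from the command line; --test restricts the fetch to a couple of articles per feed and -vv prints download progress (the output filename is arbitrary):

    ebook-convert "The New York Times_1000.recipe" nyt.epub --test -vv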
