Diffstat (limited to 'dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe')
-rw-r--r-- | dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe | 368
1 file changed, 368 insertions, 0 deletions
diff --git a/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe b/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe
new file mode 100644
index 0000000..63df3fd
--- /dev/null
+++ b/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe
@@ -0,0 +1,368 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import datetime
+import json
+import re
+
+import mechanize
+
+from calibre import strftime
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.utils.date import strptime
+from calibre.web.feeds.news import BasicNewsRecipe
+from polyglot.urllib import urlencode
+
+is_web_edition = False
+use_wayback_machine = False
+
+# This is an Apollo persisted query hash which you can get
+# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+# or by https://www.nytimes.com/section/world
+persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
+
+# The sections to download when downloading the web edition, comment out
+# the sections you are not interested in
+web_sections = [
+    ('World', 'world'),
+    ('U.S.', 'us'),
+    ('Politics', 'politics'),
+    # ('New York', 'nyregion'),
+    ('Business', 'business'),
+    ('Technology', 'technology'),
+    # ('Sports', 'sports'),
+    ('Science', 'science'),
+    ('Health', 'health'),
+    ('Opinion', 'opinion'),
+    # ('Arts', 'arts'),
+    # ('Books', 'books'),
+    # ('Movies', 'movies'),
+    # ('Music', 'arts/music'),
+    # ('Television', 'arts/television'),
+    # ('Style', 'style'),
+    # ('Dining & Wine', 'food'),
+    # ('Fashion & Style', 'fashion'),
+    # ('Home & Garden', 'garden'),
+    ('Travel', 'travel'),
+    # ('Education', 'education'),
+    # ('Multimedia', 'multimedia'),
+    # ('Obituaries', 'obituaries'),
+    # ('Sunday Magazine', 'magazine')
+]
+# web_sections = [ ('Business', 'business'), ]
+url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
+
+
+def date_from_url(url):
+    m = url_date_pat.search(url)
+    if m is not None:
+        return datetime.date(*map(int, m.groups()))
+
+
+def format_date(d):
+    try:
+        return strftime(' [%a, %d %b %Y]', d)
+    except Exception:
+        return strftime(' [%Y/%m/%d]', d)
+
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
+def new_tag(soup, name, attrs=()):
+    impl = getattr(soup, 'new_tag', None)
+    if impl is not None:
+        return impl(name, attrs=dict(attrs))
+    return Tag(soup, name, attrs=attrs or None)
+
+
+class NewYorkTimes(BasicNewsRecipe):
+    if is_web_edition:
+        title = 'The New York Times (Web)'
+        description = (
+            'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
+            'Use advanced menu to make changes to fetch Todays Paper'
+        )
+    else:
+        title = 'The New York Times'
+        description = (
+            'New York Times. Todays Paper '
+            'Use advanced menu to make changes to fetch Web Edition'
+        )
+    encoding = 'utf-8'
+    __author__ = 'Kovid Goyal'
+    language = 'en_US'
+    ignore_duplicate_articles = {'title', 'url'}
+    no_stylesheets = True
+    oldest_web_edition_article = 7  # days
+
+    extra_css = '''
+        .byl, .time { font-size:small; color:#202020; }
+        .cap { font-size:small; text-align:center; }
+        .cred { font-style:italic; font-size:small; }
+        em, blockquote { color: #202020; }
+        .sc { font-variant: small-caps; }
+        .lbl { font-size:small; color:#404040; }
+        img { display:block; margin:0 auto; }
+    '''
+
+    @property
+    def nyt_parser(self):
+        ans = getattr(self, '_nyt_parser', None)
+        if ans is None:
+            from calibre.live import load_module
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
+        return ans
+
+    def get_nyt_page(self, url, skip_wayback=False):
+        if use_wayback_machine and not skip_wayback:
+            from calibre import browser
+            return self.nyt_parser.download_url(url, browser())
+        return self.index_to_soup(url, raw=True)
+
+    def preprocess_raw_html(self, raw_html, url):
+        cleaned = self.nyt_parser.clean_js_json(raw_html)
+        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
+
+    articles_are_obfuscated = use_wayback_machine
+
+    if use_wayback_machine:
+        def get_obfuscated_article(self, url):
+            from calibre.ptempfile import PersistentTemporaryFile
+            with PersistentTemporaryFile() as tf:
+                tf.write(self.get_nyt_page(url))
+            return tf.name
+
+    recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want ' + ('Todays Paper' if is_web_edition else 'Web Edition'),
+            'default': 'Web Edition' if is_web_edition else 'Todays Paper',
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
+        'date': {
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
+            'long': 'For example, 2024/07/16'
+        },
+        'res': {
+            'short': (
+                'For hi-res images, select a resolution from the following\noptions: '
+                'popup, jumbo, mobileMasterAt3x, superJumbo'
+            ),
+            'long': (
+                'This is useful for non e-ink devices, and for a lower file size\nthan '
+                'the default, use mediumThreeByTwo440, mediumThreeByTwo225, articleInline.'
+            ),
+        },
+        'comp': {
+            'short': 'Compress News Images?',
+            'long': 'enter yes',
+            'default': 'no'
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        self.is_web_edition = is_web_edition
+        if w and isinstance(w, str):
+            if w == 'yes':
+                self.is_web_edition = not is_web_edition
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
+        if c and isinstance(c, str):
+            if c.lower() == 'yes':
+                self.compress_news_images = True
+
+    def read_todays_paper(self):
+        pdate = self.recipe_specific_options.get('date')
+        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
+        if pdate and isinstance(pdate, str):
+            return pdate, self.index_to_soup(templ.format(pdate))
+        # Can't figure out how to get the date, so just try today's and yesterday's dates
+        date = datetime.date.today()
+        pdate = date.strftime('%Y/%m/%d')
+        try:
+            soup = self.index_to_soup(templ.format(pdate))
+        except Exception as e:
+            if getattr(e, 'code', None) == 404:
+                date -= datetime.timedelta(days=1)
+                pdate = date.strftime('%Y/%m/%d')
+                soup = self.index_to_soup(templ.format(pdate))
+            else:
+                raise
+        self.log("Using today's paper from:", pdate)
+        return pdate, soup
+
+    def read_nyt_metadata(self):
+        pdate, soup = self.read_todays_paper()
+        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
+        self.timefmt = strftime(' [%d %b, %Y]', date)
+        self.nytimes_publication_date = pdate
+        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+        script = type(u'')(script)
+        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        clean_json = self.nyt_parser.clean_js_json(raw_json)
+        self.nytimes_graphql_config = json.loads(clean_json)['config']
+        return soup
+
+    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
+        query = {
+            'operationName': operationName,
+            'variables': json.dumps({
+                'id': qid,
+                'first': 10,
+                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
+                'isFetchMore': False,
+                'isTranslatable': False,
+                'isEspanol': False,
+                'highlightsListUri': 'nyt://per/personalized-list/__null__',
+                'highlightsListFirst': 0,
+                'hasHighlightsList': False
+            }, separators=',:'),
+            'extensions': json.dumps({
+                'persistedQuery': {
+                    'version': 1,
+                    'sha256Hash': persistedQuery,
+                },
+            }, separators=',:')
+        }
+        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
+        br = self.browser
+        # br.set_debug_http(True)
+        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
+        headers['Accept'] = 'application/json'
+        req = mechanize.Request(url, headers=headers)
+        raw = br.open(req).read()
+        # open('/t/raw.json', 'wb').write(raw)
+        return json.loads(raw)
+
+    def parse_todays_page(self):
+        self.read_nyt_metadata()
+        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
+        data = self.nyt_graphql_query(query_id)
+        return parse_todays_page(data, self.log)
+
+    def parse_web_sections(self):
+        self.read_nyt_metadata()
+        feeds = []
+        for section_title, slug in web_sections:
+            query_id = '/section/' + slug
+            try:
+                data = self.nyt_graphql_query(query_id)
+                self.log('Section:', section_title)
+                articles = parse_web_section(data, log=self.log, title=section_title)
+            except Exception as e:
+                self.log('Failed to parse section:', section_title, 'with error:', e)
+                articles = []
+            if articles:
+                feeds.append((section_title, articles))
+            else:
+                # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
+                self.log(' No articles found in section:', section_title)
+            if self.test and len(feeds) >= self.test[0]:
+                break
+        return feeds
+
+    def parse_index(self):
+        # return [('All articles', [
+        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
+        # ])]
+        if self.is_web_edition:
+            return self.parse_web_sections()
+        return self.parse_todays_page()
+
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = 'User-Agent: Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        return br
+
+    def preprocess_html(self, soup):
+        w = self.recipe_specific_options.get('res')
+        if w and isinstance(w, str):
+            res = '-' + w
+            for img in soup.findAll('img', attrs={'src': True}):
+                if '-article' in img['src']:
+                    ext = img['src'].split('?')[0].split('.')[-1]
+                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+        for c in soup.findAll('div', attrs={'class': 'cap'}):
+            for p in c.findAll(['p', 'div']):
+                p.name = 'span'
+        return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/|/card/', url):
+            return url
+        self.log('\tSkipping ', url)
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_todays_page(data, log=print):
+    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+    feeds = []
+    for cont in containers:
+        if cont['__typename'] != 'LegacyCollectionContainer':
+            continue
+        section_name = cont['label'].strip()
+        if not section_name:
+            continue
+        log(section_name)
+        articles = []
+        for rel in cont['relations']:
+            if rel.get('__typename') == 'LegacyCollectionRelation':
+                asset = rel['asset']
+                if asset['__typename'] == 'Article':
+                    articles.append(asset_to_article(asset))
+                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+        if articles:
+            feeds.append((section_name, articles))
+    return feeds
+
+
+def parse_web_section(data, log=print, title=''):
+    articles = []
+    try:
+        containers = data['data']['legacyCollection']['collectionsPage']
+        if containers.get('embeddedCollections'):
+            containers = containers['embeddedCollections']
+        else:
+            containers = [containers]
+    except Exception as e:
+        log('Failed to parse web section', title, 'with error:', e)
+        return articles
+    for cont in containers:
+        for s in cont['stream']['edges']:
+            asset = s['node']
+            if asset['__typename'] == 'Article':
+                articles.append(asset_to_article(asset))
+                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+    return articles
+
+
+if __name__ == '__main__':
+    import sys
+    data = json.loads(open(sys.argv[-1], 'rb').read())
+    if is_web_edition:
+        parse_web_section(data)
+    else:
+        parse_todays_page(data)
+
+
+calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
\ No newline at end of file
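
The trailing if __name__ == '__main__': block makes the two section parsers testable offline: capture the JSON body of a CollectionsQuery XHR in the browser's network inspector, save it to a file, and pass the path as the script's last argument. A minimal standalone sketch of the same workflow, assuming a captured response saved as capture.json (the file name and the summary printout are illustrative assumptions, not part of the recipe):

    # Offline check of the feed parsers against a captured GraphQL response.
    # capture.json is a hypothetical file holding the JSON body of a
    # CollectionsQuery XHR; parse_todays_page() is the module-level function
    # defined in the recipe above.
    import json

    with open('capture.json', 'rb') as f:
        data = json.load(f)

    # parse_todays_page() returns calibre-style feeds:
    # [(section_name, [{'title': ..., 'url': ..., 'description': ...}, ...]), ...]
    for section_name, articles in parse_todays_page(data):
        print(section_name, '->', len(articles), 'articles')

For an end-to-end run, calibre's usual recipe workflow should apply: something like ebook-convert 'The New York Times_1000.recipe' .epub --test -v, which fetches only a couple of articles per feed and is a quick way to iterate on changes to the section lists above.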
