#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import json
import re

import mechanize

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe
from polyglot.urllib import urlencode

is_web_edition = False
use_wayback_machine = False

# This is an Apollo persisted query hash which you can get
# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
# or by https://www.nytimes.com/section/world
persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'

# The sections to download when downloading the web edition, comment out
# the sections you are not interested in
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    # ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    # ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    # ('Arts', 'arts'),
    # ('Books', 'books'),
    # ('Movies', 'movies'),
    # ('Music', 'arts/music'),
    # ('Television', 'arts/television'),
    # ('Style', 'style'),
    # ('Dining & Wine', 'food'),
    # ('Fashion & Style', 'fashion'),
    # ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    # ('Education', 'education'),
    # ('Multimedia', 'multimedia'),
    # ('Obituaries', 'obituaries'),
    # ('Sunday Magazine', 'magazine')
]
# web_sections = [ ('Business', 'business'), ]

url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)
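
# Quick illustration of the URL date helper above (the example URL is made up):
#   date_from_url('https://www.nytimes.com/2024/07/16/world/example.html')
#       -> datetime.date(2024, 7, 16)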


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NewYorkTimes(BasicNewsRecipe):

    if is_web_edition:
        title = 'The New York Times (Web)'
        description = (
            'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
            "Use the advanced menu to fetch Today's Paper instead."
        )
    else:
        title = 'The New York Times'
        description = (
            "New York Times, Today's Paper. "
            'Use the advanced menu to fetch the Web Edition instead.'
        )

    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en_US'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    oldest_web_edition_article = 7  # days

    extra_css = '''
        .byl, .time { font-size:small; color:#202020; }
        .cap { font-size:small; text-align:center; }
        .cred { font-style:italic; font-size:small; }
        em, blockquote { color: #202020; }
        .sc { font-variant: small-caps; }
        .lbl { font-size:small; color:#404040; }
        img { display:block; margin:0 auto; }
    '''

    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
        return ans

    def get_nyt_page(self, url, skip_wayback=False):
        if use_wayback_machine and not skip_wayback:
            from calibre import browser
            return self.nyt_parser.download_url(url, browser())
        return self.index_to_soup(url, raw=True)

    def preprocess_raw_html(self, raw_html, url):
        cleaned = self.nyt_parser.clean_js_json(raw_html)
        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)

    articles_are_obfuscated = use_wayback_machine

    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tf:
                tf.write(self.get_nyt_page(url))
            return tf.name

    recipe_specific_options = {
        'web': {
            'short': 'Type in yes, if you want ' + ("Today's Paper" if is_web_edition else 'the Web Edition'),
            'default': 'Web Edition' if is_web_edition else "Today's Paper",
        },
        'days': {
            'short': 'Oldest article to download from this news source. In days',
            'long': 'For example, 1 gives you articles from the past 24 hours\n(Works only for the Web Edition)',
            'default': str(oldest_web_edition_article)
        },
        'date': {
            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of the NYT newspaper',
            'long': 'For example, 2024/07/16'
        },
        'res': {
            'short': (
                'For hi-res images, select a resolution from the following\noptions: '
                'popup, jumbo, mobileMasterAt3x, superJumbo'
            ),
            'long': (
                'This is useful for non e-ink devices, and for a lower file size\nthan '
                'the default, use mediumThreeByTwo440, mediumThreeByTwo225, articleInline.'
            ),
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'enter yes',
            'default': 'no'
        }
    }
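
    # These options are presented by the calibre Fetch news dialog (the
    # "advanced menu" the descriptions above refer to); every value arrives
    # here as a string, which is why __init__ below checks isinstance(x, str)
    # before acting on it.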

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        c = self.recipe_specific_options.get('comp')
        d = self.recipe_specific_options.get('days')
        w = self.recipe_specific_options.get('web')
        self.is_web_edition = is_web_edition
        if w and isinstance(w, str):
            if w == 'yes':
                self.is_web_edition = not is_web_edition
        if d and isinstance(d, str):
            self.oldest_web_edition_article = float(d)
        if c and isinstance(c, str):
            if c.lower() == 'yes':
                self.compress_news_images = True

    def read_todays_paper(self):
        pdate = self.recipe_specific_options.get('date')
        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
        if pdate and isinstance(pdate, str):
            return pdate, self.index_to_soup(templ.format(pdate))
        # Can't figure out how to get the date, so just try today's and yesterday's dates
        date = datetime.date.today()
        pdate = date.strftime('%Y/%m/%d')
        try:
            soup = self.index_to_soup(templ.format(pdate))
        except Exception as e:
            if getattr(e, 'code', None) == 404:
                date -= datetime.timedelta(days=1)
                pdate = date.strftime('%Y/%m/%d')
                soup = self.index_to_soup(templ.format(pdate))
            else:
                raise
        self.log("Using today's paper from:", pdate)
        return pdate, soup

    def read_nyt_metadata(self):
        pdate, soup = self.read_todays_paper()
        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
        self.timefmt = strftime(' [%d %b, %Y]', date)
        self.nytimes_publication_date = pdate
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
        script = type(u'')(script)
        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
        clean_json = self.nyt_parser.clean_js_json(raw_json)
        self.nytimes_graphql_config = json.loads(clean_json)['config']
        return soup
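
    # The listings are fetched from the NYT GraphQL endpoint using an Apollo
    # "persisted query": rather than sending the query text, the request sends
    # the sha256 hash defined at the top of this file plus the collection id
    # (for example /issue/todayspaper/2024/07/16/todays-new-york-times or
    # /section/world) in the variables. The endpoint URL and request headers
    # come from the page config extracted by read_nyt_metadata() above.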
    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
        query = {
            'operationName': operationName,
            'variables': json.dumps({
                'id': qid,
                'first': 10,
                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
                'isFetchMore': False,
                'isTranslatable': False,
                'isEspanol': False,
                'highlightsListUri': 'nyt://per/personalized-list/__null__',
                'highlightsListFirst': 0,
                'hasHighlightsList': False,
            }, separators=(',', ':')),
            'extensions': json.dumps({
                'persistedQuery': {
                    'version': 1,
                    'sha256Hash': persistedQuery,
                },
            }, separators=(',', ':'))
        }
        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
        br = self.browser
        # br.set_debug_http(True)
        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
        headers['Accept'] = 'application/json'
        req = mechanize.Request(url, headers=headers)
        raw = br.open(req).read()
        # open('/t/raw.json', 'wb').write(raw)
        return json.loads(raw)

    def parse_todays_page(self):
        self.read_nyt_metadata()
        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
        data = self.nyt_graphql_query(query_id)
        return parse_todays_page(data, self.log)

    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            query_id = '/section/' + slug
            try:
                data = self.nyt_graphql_query(query_id)
                self.log('Section:', section_title)
                articles = parse_web_section(data, log=self.log, title=section_title)
            except Exception as e:
                self.log('Failed to parse section:', section_title, 'with error:', e)
                articles = []
            if articles:
                feeds.append((section_title, articles))
            else:
                # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
                self.log(' No articles found in section:', section_title)
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        if self.is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()

    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        return br
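
    # NYT image URLs embed a crop name before the extension (illustrative
    # assumption: something like .../merlin_123-articleLarge.jpg). When the
    # 'res' option is set, preprocess_html below replaces everything after the
    # last '-article' with the requested crop, keeping the original extension.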
    def preprocess_html(self, soup):
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '-' + w
            for img in soup.findAll('img', attrs={'src': True}):
                if '-article' in img['src']:
                    ext = img['src'].split('?')[0].split('.')[-1]
                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
        for c in soup.findAll('div', attrs={'class': 'cap'}):
            for p in c.findAll(['p', 'div']):
                p.name = 'span'
        return soup

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        if not re.search(r'/video/|/athletic/|/card/', url):
            return url
        self.log('\tSkipping ', url)


def asset_to_article(asset):
    title = asset['headline']['default']
    return {'title': title, 'url': asset['url'], 'description': asset['summary']}


def parse_todays_page(data, log=print):
    containers = data['data']['legacyCollection']['groupings'][0]['containers']
    feeds = []
    for cont in containers:
        if cont['__typename'] != 'LegacyCollectionContainer':
            continue
        section_name = cont['label'].strip()
        if not section_name:
            continue
        log(section_name)
        articles = []
        for rel in cont['relations']:
            if rel.get('__typename') == 'LegacyCollectionRelation':
                asset = rel['asset']
                if asset['__typename'] == 'Article':
                    articles.append(asset_to_article(asset))
                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
        if articles:
            feeds.append((section_name, articles))
    return feeds


def parse_web_section(data, log=print, title=''):
    articles = []
    try:
        containers = data['data']['legacyCollection']['collectionsPage']
        if containers.get('embeddedCollections'):
            containers = containers['embeddedCollections']
        else:
            containers = [containers]
    except Exception as e:
        log('Failed to parse web section', title, 'with error:', e)
        return articles
    for cont in containers:
        for s in cont['stream']['edges']:
            asset = s['node']
            if asset['__typename'] == 'Article':
                articles.append(asset_to_article(asset))
                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
    return articles


if __name__ == '__main__':
    import sys
    data = json.loads(open(sys.argv[-1], 'rb').read())
    if is_web_edition:
        parse_web_section(data)
    else:
        parse_todays_page(data)

calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
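
# The __main__ block above is a small offline test harness: save a GraphQL
# response to a file (for example by uncommenting the open('/t/raw.json', ...)
# lines in nyt_graphql_query or parse_web_sections) and run something like
#   python nytimes.py /t/raw.json
# to check that parse_todays_page()/parse_web_section() still understand the
# JSON shape. The name nytimes.py is just whatever this recipe file is saved as.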