path: root/dotfiles/system/.config/calibre/custom_recipes/The New York Times_1000.recipe
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import json
import re

import mechanize

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe
from polyglot.urllib import urlencode

is_web_edition = False
use_wayback_machine = False

# This is an Apollo persisted query hash, which you can get by looking at
# the XHR requests made by https://www.nytimes.com/section/todayspaper
# or https://www.nytimes.com/section/world
persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
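# Decoded, the 'extensions' URL parameter assembled in nyt_graphql_query()
# below carries it as:
#   {"persistedQuery": {"version": 1, "sha256Hash": "<hash>"}}
# so if NYT rotates the query, copy the fresh sha256Hash from the browser's
# Network tab into persistedQuery above.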

# The sections to download when fetching the web edition; comment out
# the sections you are not interested in
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    # ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    # ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    # ('Arts', 'arts'),
    # ('Books', 'books'),
    # ('Movies', 'movies'),
    # ('Music', 'arts/music'),
    # ('Television', 'arts/television'),
    # ('Style', 'style'),
    # ('Dining & Wine', 'food'),
    # ('Fashion & Style', 'fashion'),
    # ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    # ('Education', 'education'),
    # ('Multimedia', 'multimedia'),
    # ('Obituaries', 'obituaries'),
    # ('Sunday Magazine', 'magazine')
]
# web_sections = [ ('Business', 'business'), ]
url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
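# Matches the /YYYY/MM/DD/ segment of article URLs; date_from_url() below
# turns e.g. '.../2024/07/16/world/example.html' into datetime.date(2024, 7, 16)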


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
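# e.g. soup.find('div', **classes('cap cred')) matches any element whose
# class attribute shares at least one name with the query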


def new_tag(soup, name, attrs=()):
    # BeautifulSoup 4 exposes soup.new_tag(); fall back to constructing a
    # Tag directly for the older BeautifulSoup bundled with calibre
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NewYorkTimes(BasicNewsRecipe):
    if is_web_edition:
        title = 'The New York Times (Web)'
        description = (
            'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
            "Use the advanced menu to switch to fetching Today's Paper."
        )
    else:
        title = 'The New York Times'
        description = (
            "New York Times, Today's Paper. "
            'Use the advanced menu to switch to fetching the Web Edition.'
        )
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en_US'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    oldest_web_edition_article = 7  # days

    extra_css = '''
        .byl, .time { font-size:small; color:#202020; }
        .cap { font-size:small; text-align:center; }
        .cred { font-style:italic; font-size:small; }
        em, blockquote { color: #202020; }
        .sc { font-variant: small-caps; }
        .lbl { font-size:small; color:#404040; }
        img { display:block; margin:0 auto; }
    '''

    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
        return ans

    def get_nyt_page(self, url, skip_wayback=False):
        if use_wayback_machine and not skip_wayback:
            from calibre import browser
            return self.nyt_parser.download_url(url, browser())
        return self.index_to_soup(url, raw=True)

    def preprocess_raw_html(self, raw_html, url):
        cleaned = self.nyt_parser.clean_js_json(raw_html)
        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)

    articles_are_obfuscated = use_wayback_machine

    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            # Download via the Wayback Machine and hand calibre a local copy
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tf:
                tf.write(self.get_nyt_page(url))
            return tf.name

    recipe_specific_options = {
        'web': {
            'short': 'Type in yes if you want ' + ("Today's Paper" if is_web_edition else 'the Web Edition'),
            'default': 'Web Edition' if is_web_edition else "Today's Paper",
        },
        'days': {
            'short': 'Oldest article to download from this news source, in days',
            'long': 'For example, 1 gives you articles from the past 24 hours\n(only used for the Web Edition)',
            'default': str(oldest_web_edition_article)
        },
        'date': {
            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of the NYT newspaper',
            'long': 'For example, 2024/07/16'
        },
        'res': {
            'short': (
                'For hi-res images, select a resolution from the following\noptions: '
                'popup, jumbo, mobileMasterAt3x, superJumbo'
            ),
            'long': (
                'Hi-res images are useful on non e-ink devices. For a smaller file size\nthan '
                'the default, use mediumThreeByTwo440, mediumThreeByTwo225 or articleInline.'
            ),
        },
        'comp': {
            'short': 'Compress news images?',
            'long': 'Enter yes to compress downloaded images and reduce file size',
            'default': 'no'
        }
    }

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        c = self.recipe_specific_options.get('comp')
        d = self.recipe_specific_options.get('days')
        w = self.recipe_specific_options.get('web')
        self.is_web_edition = is_web_edition
        if w and isinstance(w, str):
            if w == 'yes':
                self.is_web_edition = not is_web_edition
        if d and isinstance(d, str):
            self.oldest_web_edition_article = float(d)
        if c and isinstance(c, str):
            if c.lower() == 'yes':
                self.compress_news_images = True

    def read_todays_paper(self):
        pdate = self.recipe_specific_options.get('date')
        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
        if pdate and isinstance(pdate, str):
            return pdate, self.index_to_soup(templ.format(pdate))
        # Can't figure out how to get the date, so try today's and yesterday's dates
        date = datetime.date.today()
        pdate = date.strftime('%Y/%m/%d')
        try:
            soup = self.index_to_soup(templ.format(pdate))
        except Exception as e:
            if getattr(e, 'code', None) == 404:
                date -= datetime.timedelta(days=1)
                pdate = date.strftime('%Y/%m/%d')
                soup = self.index_to_soup(templ.format(pdate))
            else:
                raise
        self.log("Using today's paper from:", pdate)
        return pdate, soup

    def read_nyt_metadata(self):
        pdate, soup = self.read_todays_paper()
        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
        self.timefmt = strftime(' [%d %b, %Y]', date)
        self.nytimes_publication_date = pdate
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
        script = type(u'')(script)  # coerce the Tag to a plain (unicode) string
        # Slice out the JSON object literal assigned to window.__preloadedData
        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
        clean_json = self.nyt_parser.clean_js_json(raw_json)
        self.nytimes_graphql_config = json.loads(clean_json)['config']
        return soup

    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
        query = {
            'operationName': operationName,
            'variables': json.dumps({
                'id': qid,
                'first': 10,
                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
                'isFetchMore': False,
                'isTranslatable': False,
                'isEspanol': False,
                'highlightsListUri': 'nyt://per/personalized-list/__null__',
                'highlightsListFirst': 0,
                'hasHighlightsList': False
            }, separators=(',', ':')),
            'extensions': json.dumps({
                'persistedQuery': {
                    'version': 1,
                    'sha256Hash': persistedQuery,
                },
            }, separators=(',', ':'))
        }
        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
        br = self.browser
        # br.set_debug_http(True)
        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
        headers['Accept'] = 'application/json'
        req = mechanize.Request(url, headers=headers)
        raw = br.open(req).read()
        # open('/t/raw.json', 'wb').write(raw)
        return json.loads(raw)

    def parse_todays_page(self):
        self.read_nyt_metadata()
        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
        data = self.nyt_graphql_query(query_id)
        return parse_todays_page(data, self.log)

    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            query_id = '/section/' + slug
            try:
                data = self.nyt_graphql_query(query_id)
                self.log('Section:', section_title)
                articles = parse_web_section(data, log=self.log, title=section_title)
            except Exception as e:
                self.log('Failed to parse section:', section_title, 'with error:', e)
                articles = []
            if articles:
                feeds.append((section_title, articles))
            else:
                # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
                self.log('  No articles found in section:', section_title)
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        if self.is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()

    def get_browser(self, *args, **kwargs):
        # Identify as the Internet Archive's crawler; the 'User-Agent' header
        # name is added by the browser, so only the value belongs here
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        return br

    def preprocess_html(self, soup):
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            # Swap the size suffix in NYT image URLs, e.g.
            # ...-articleLarge.jpg -> ...-superJumbo.jpg
            res = '-' + w
            for img in soup.findAll('img', attrs={'src': True}):
                if '-article' in img['src']:
                    ext = img['src'].split('?')[0].split('.')[-1]
                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
        # Flatten block elements inside caption divs so they render inline
        for c in soup.findAll('div', attrs={'class': 'cap'}):
            for p in c.findAll(['p', 'div']):
                p.name = 'span'
        return soup

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        if not re.search(r'/video/|/athletic/|/card/', url):
            return url
        self.log('\tSkipping ', url)


def asset_to_article(asset):
    title = asset['headline']['default']
    return {'title': title, 'url': asset['url'], 'description': asset['summary']}


def parse_todays_page(data, log=print):
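    # Expected payload shape, inferred from the traversal below:
    #   data.legacyCollection.groupings[0].containers[].label
    #   data.legacyCollection.groupings[0].containers[].relations[].asset
    # where each asset with __typename 'Article' feeds asset_to_article()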
    containers = data['data']['legacyCollection']['groupings'][0]['containers']
    feeds = []
    for cont in containers:
        if cont['__typename'] != 'LegacyCollectionContainer':
            continue
        section_name = cont['label'].strip()
        if not section_name:
            continue
        log(section_name)
        articles = []
        for rel in cont['relations']:
            if rel.get('__typename') == 'LegacyCollectionRelation':
                asset = rel['asset']
                if asset['__typename'] == 'Article':
                    articles.append(asset_to_article(asset))
                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
        if articles:
            feeds.append((section_name, articles))
    return feeds


def parse_web_section(data, log=print, title=''):
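    # The web-section payload differs from today's paper: collectionsPage is
    # either a single container or a list under embeddedCollections, and each
    # container lists its articles as stream.edges[].node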
    articles = []
    try:
        containers = data['data']['legacyCollection']['collectionsPage']
        if containers.get('embeddedCollections'):
            containers = containers['embeddedCollections']
        else:
            containers = [containers]
    except Exception as e:
        log('Failed to parse web section', title, 'with error:', e)
        return articles
    for cont in containers:
        for s in cont['stream']['edges']:
            asset = s['node']
            if asset['__typename'] == 'Article':
                articles.append(asset_to_article(asset))
                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
    return articles


if __name__ == '__main__':
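    # Offline test of the parsers: save a raw GraphQL response to a file
    # (see the commented-out open('/t/raw.json', ...) calls above) and run
    # this file as a Python script (calibre must be importable), e.g.:
    #   python 'The New York Times_1000.recipe' /path/to/raw.json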
    import sys
    data = json.loads(open(sys.argv[-1], 'rb').read())
    if is_web_edition:
        parse_web_section(data)
    else:
        parse_todays_page(data)


# A common desktop browser UA string; not used elsewhere in this file
calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'