| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
 | {
  "amazon": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport re\nimport socket\nimport string\nimport time\nfrom functools import partial\n\ntry:\n    from queue import Empty, Queue\nexcept ImportError:\n    from Queue import Empty, Queue\n\nfrom threading import Thread\n\ntry:\n    from urllib.parse import urlparse\nexcept ImportError:\n    from urlparse import urlparse\n\nfrom mechanize import HTTPError\n\nfrom calibre import as_unicode, browser, random_user_agent, xml_replace_entities\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase\nfrom calibre.utils.icu import lower as icu_lower\nfrom calibre.utils.localization import canonicalize_lang\nfrom calibre.utils.random_ua import accept_header_for_ua\n\n\ndef sort_matches_preferring_kindle_editions(matches):\n    upos_map = {url:i for i, url in enumerate(matches)}\n\n    def skey(url):\n        opos = upos_map[url]\n        parts = url.split('/')\n        try:\n            idx = parts.index('dp')\n        except Exception:\n            idx = -1\n        if idx < 0 or idx + 1 >= len(parts) or not parts[idx+1].startswith('B'):\n            return 1, opos\n        return 0, opos\n    matches.sort(key=skey)\n    return matches\n\n\ndef iri_quote_plus(url):\n    from calibre.ebooks.oeb.base import urlquote\n    ans = urlquote(url)\n    if isinstance(ans, bytes):\n        ans = ans.decode('utf-8')\n    return ans.replace('%20', '+')\n\n\ndef user_agent_is_ok(ua):\n    return 'Mobile/' not in ua and 'Mobile ' not in ua\n\n\nclass CaptchaError(Exception):\n    pass\n\n\nclass SearchFailed(ValueError):\n    pass\n\n\nclass UrlNotFound(ValueError):\n\n    def __init__(self, url):\n        ValueError.__init__(self, 'The URL {} was not found (HTTP 404)'.format(url))\n\n\nclass UrlTimedOut(ValueError):\n\n    def __init__(self, url):\n        ValueError.__init__(self, 'Timed out fetching {} try again later'.format(url))\n\n\ndef parse_html(raw):\n    try:\n        from html5_parser import parse\n    except ImportError:\n        # Old versions of calibre\n        import html5lib\n        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n    else:\n        return parse(raw)\n\n\ndef parse_details_page(url, log, timeout, browser, domain):\n    from lxml.html import tostring\n\n    from calibre.ebooks.chardet import xml_to_unicode\n    from calibre.utils.cleantext import clean_ascii_chars\n    try:\n        from calibre.ebooks.metadata.sources.update import search_engines_module\n        get_data_for_cached_url = search_engines_module().get_data_for_cached_url\n    except Exception:\n        def get_data_for_cached_url(*a):\n            return None\n    raw = get_data_for_cached_url(url)\n    if raw:\n        log('Using cached details for url:', url)\n    else:\n        log('Downloading details from:', url)\n        try:\n            raw = browser.open_novisit(url, timeout=timeout).read().strip()\n        except Exception as e:\n            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:\n                log.error('URL not found: %r' % url)\n                raise UrlNotFound(url)\n            attr = getattr(e, 'args', [None])\n            attr = attr if attr else [None]\n            if isinstance(attr[0], socket.timeout):\n                msg = 'Details page timed out. Try again later.'\n                log.error(msg)\n                raise UrlTimedOut(url)\n            else:\n                msg = 'Failed to make details query: %r' % url\n                log.exception(msg)\n                raise ValueError('Could not make details query for {}'.format(url))\n\n    oraw = raw\n    if 'amazon.com.br' in url:\n        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag\n        raw = raw.decode('utf-8')\n    raw = xml_to_unicode(raw, strip_encoding_pats=True,\n                         resolve_entities=True)[0]\n    if '<title>404 - ' in raw:\n        raise ValueError('Got a 404 page for: %r' % url)\n    if '>Could not find the requested document in the cache.<' in raw:\n        raise ValueError('No cached entry for %s found' % url)\n\n    try:\n        root = parse_html(clean_ascii_chars(raw))\n    except Exception:\n        msg = 'Failed to parse amazon details page: %r' % url\n        log.exception(msg)\n        raise ValueError(msg)\n    if domain == 'jp':\n        for a in root.xpath('//a[@href]'):\n            if ('black-curtain-redirect.html' in a.get('href')) or ('/black-curtain/save-eligibility/black-curtain' in a.get('href')):\n                url = a.get('href')\n                if url:\n                    if url.startswith('/'):\n                        url = 'https://amazon.co.jp' + a.get('href')\n                    log('Black curtain redirect found, following')\n                    return parse_details_page(url, log, timeout, browser, domain)\n\n    errmsg = root.xpath('//*[@id=\"errorMessage\"]')\n    if errmsg:\n        msg = 'Failed to parse amazon details page: %r' % url\n        msg += tostring(errmsg, method='text', encoding='unicode').strip()\n        log.error(msg)\n        raise ValueError(msg)\n\n    from css_selectors import Select\n    selector = Select(root)\n    return oraw, root, selector\n\n\ndef parse_asin(root, log, url):\n    try:\n        link = root.xpath('//link[@rel=\"canonical\" and @href]')\n        for l in link:\n            return l.get('href').rpartition('/')[-1]\n    except Exception:\n        log.exception('Error parsing ASIN for url: %r' % url)\n\n\nclass Worker(Thread):  # Get details {{{\n\n    '''\n    Get book details from amazons book page in a separate thread\n    '''\n\n    def __init__(self, url, result_queue, browser, log, relevance, domain,\n                 plugin, timeout=20, testing=False, preparsed_root=None,\n                 cover_url_processor=None, filter_result=None):\n        Thread.__init__(self)\n        self.cover_url_processor = cover_url_processor\n        self.preparsed_root = preparsed_root\n        self.daemon = True\n        self.testing = testing\n        self.url, self.result_queue = url, result_queue\n        self.log, self.timeout = log, timeout\n        self.filter_result = filter_result or (lambda x, log: True)\n        self.relevance, self.plugin = relevance, plugin\n        self.browser = browser\n        self.cover_url = self.amazon_id = self.isbn = None\n        self.domain = domain\n        from lxml.html import tostring\n        self.tostring = tostring\n\n        months = {  # {{{\n            'de': {\n                1: ['jän', 'januar'],\n                2: ['februar'],\n                3: ['märz'],\n                5: ['mai'],\n                6: ['juni'],\n                7: ['juli'],\n                10: ['okt', 'oktober'],\n                12: ['dez', 'dezember']\n            },\n            'it': {\n                1: ['gennaio', 'enn'],\n                2: ['febbraio', 'febbr'],\n                3: ['marzo'],\n                4: ['aprile'],\n                5: ['maggio', 'magg'],\n                6: ['giugno'],\n                7: ['luglio'],\n                8: ['agosto', 'ag'],\n                9: ['settembre', 'sett'],\n                10: ['ottobre', 'ott'],\n                11: ['novembre'],\n                12: ['dicembre', 'dic'],\n            },\n            'fr': {\n                1: ['janv'],\n                2: ['févr'],\n                3: ['mars'],\n                4: ['avril'],\n                5: ['mai'],\n                6: ['juin'],\n                7: ['juil'],\n                8: ['août'],\n                9: ['sept'],\n                10: ['oct', 'octobre'],\n                11: ['nov', 'novembre'],\n                12: ['déc', 'décembre'],\n            },\n            'br': {\n                1: ['janeiro'],\n                2: ['fevereiro'],\n                3: ['março'],\n                4: ['abril'],\n                5: ['maio'],\n                6: ['junho'],\n                7: ['julho'],\n                8: ['agosto'],\n                9: ['setembro'],\n                10: ['outubro'],\n                11: ['novembro'],\n                12: ['dezembro'],\n            },\n            'es': {\n                1: ['enero'],\n                2: ['febrero'],\n                3: ['marzo'],\n                4: ['abril'],\n                5: ['mayo'],\n                6: ['junio'],\n                7: ['julio'],\n                8: ['agosto'],\n                9: ['septiembre', 'setiembre'],\n                10: ['octubre'],\n                11: ['noviembre'],\n                12: ['diciembre'],\n            },\n            'se': {\n                1: ['januari'],\n                2: ['februari'],\n                3: ['mars'],\n                4: ['april'],\n                5: ['maj'],\n                6: ['juni'],\n                7: ['juli'],\n                8: ['augusti'],\n                9: ['september'],\n                10: ['oktober'],\n                11: ['november'],\n                12: ['december'],\n            },\n            'jp': {\n                1: ['1月'],\n                2: ['2月'],\n                3: ['3月'],\n                4: ['4月'],\n                5: ['5月'],\n                6: ['6月'],\n                7: ['7月'],\n                8: ['8月'],\n                9: ['9月'],\n                10: ['10月'],\n                11: ['11月'],\n                12: ['12月'],\n            },\n            'nl': {\n                1: ['januari'], 2: ['februari'], 3: ['maart'], 5: ['mei'], 6: ['juni'], 7: ['juli'], 8: ['augustus'], 10: ['oktober'],\n            }\n\n        }  # }}}\n\n        self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',\n                               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n        self.months = months.get(self.domain, {})\n\n        self.pd_xpath = '''\n            //h2[text()=\"Product Details\" or \\\n                 text()=\"Produktinformation\" or \\\n                 text()=\"Dettagli prodotto\" or \\\n                 text()=\"Product details\" or \\\n                 text()=\"Détails sur le produit\" or \\\n                 text()=\"Detalles del producto\" or \\\n                 text()=\"Detalhes do produto\" or \\\n                 text()=\"Productgegevens\" or \\\n                 text()=\"基本信息\" or \\\n                 starts-with(text(), \"登録情報\")]/../div[@class=\"content\"]\n            '''\n        # Editor: is for Spanish\n        self.publisher_xpath = '''\n            descendant::*[starts-with(text(), \"Publisher:\") or \\\n                    starts-with(text(), \"Verlag:\") or \\\n                    starts-with(text(), \"Editore:\") or \\\n                    starts-with(text(), \"Editeur\") or \\\n                    starts-with(text(), \"Editor:\") or \\\n                    starts-with(text(), \"Editora:\") or \\\n                    starts-with(text(), \"Uitgever:\") or \\\n                    starts-with(text(), \"Utgivare:\") or \\\n                    starts-with(text(), \"出版社:\")]\n            '''\n        self.pubdate_xpath = '''\n            descendant::*[starts-with(text(), \"Publication Date:\") or \\\n                    starts-with(text(), \"Audible.com Release Date:\")]\n        '''\n        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Utgivare', 'Herausgeber',\n                                'Editore', 'Editeur', 'Éditeur', 'Editor', 'Editora', '出版社'}\n\n        self.language_xpath = '''\n            descendant::*[\n                starts-with(text(), \"Language:\") \\\n                or text() = \"Language\" \\\n                or text() = \"Sprache:\" \\\n                or text() = \"Lingua:\" \\\n                or text() = \"Idioma:\" \\\n                or starts-with(text(), \"Langue\") \\\n                or starts-with(text(), \"言語\") \\\n                or starts-with(text(), \"Språk\") \\\n                or starts-with(text(), \"语种\")\n                ]\n            '''\n        self.language_names = {'Language', 'Sprache', 'Språk',\n                               'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}\n\n        self.tags_xpath = '''\n            descendant::h2[\n                text() = \"Look for Similar Items by Category\" or\n                text() = \"Ähnliche Artikel finden\" or\n                text() = \"Buscar productos similares por categoría\" or\n                text() = \"Ricerca articoli simili per categoria\" or\n                text() = \"Rechercher des articles similaires par rubrique\" or\n                text() = \"Procure por items similares por categoria\" or\n                text() = \"関連商品を探す\"\n            ]/../descendant::ul/li\n        '''\n\n        self.ratings_pat = re.compile(\n            r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '\n            r'([\\d\\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'\n        )\n        self.ratings_pat_cn = re.compile(r'([0-9.]+) 颗星,最多 5 颗星')\n        self.ratings_pat_jp = re.compile(r'\\d+つ星のうち([\\d\\.]+)')\n\n        lm = {\n            'eng': ('English', 'Englisch', 'Engels', 'Engelska'),\n            'fra': ('French', 'Français'),\n            'ita': ('Italian', 'Italiano'),\n            'deu': ('German', 'Deutsch'),\n            'spa': ('Spanish', 'Espa\\xf1ol', 'Espaniol'),\n            'jpn': ('Japanese', '日本語'),\n            'por': ('Portuguese', 'Português'),\n            'nld': ('Dutch', 'Nederlands',),\n            'chs': ('Chinese', '中文', '简体中文'),\n            'swe': ('Swedish', 'Svenska'),\n        }\n        self.lang_map = {}\n        for code, names in lm.items():\n            for name in names:\n                self.lang_map[name] = code\n\n        self.series_pat = re.compile(\n            r'''\n                \\|\\s*              # Prefix\n                (Series)\\s*:\\s*    # Series declaration\n                (?P<series>.+?)\\s+  # The series name\n                \\((Book)\\s*    # Book declaration\n                (?P<index>[0-9.]+) # Series index\n                \\s*\\)\n                ''', re.X)\n\n    def delocalize_datestr(self, raw):\n        if self.domain == 'cn':\n            return raw.replace('年', '-').replace('月', '-').replace('日', '')\n        if not self.months:\n            return raw\n        ans = raw.lower()\n        for i, vals in self.months.items():\n            for x in vals:\n                ans = ans.replace(x, self.english_months[i])\n        ans = ans.replace(' de ', ' ')\n        return ans\n\n    def run(self):\n        try:\n            self.get_details()\n        except:\n            self.log.exception('get_details failed for url: %r' % self.url)\n\n    def get_details(self):\n        if self.preparsed_root is None:\n            raw, root, selector = parse_details_page(\n                self.url, self.log, self.timeout, self.browser, self.domain)\n        else:\n            raw, root, selector = self.preparsed_root\n\n        from css_selectors import Select\n        self.selector = Select(root)\n        self.parse_details(raw, root)\n\n    def parse_details(self, raw, root):\n        asin = parse_asin(root, self.log, self.url)\n        if not asin and root.xpath('//form[@action=\"/errors/validateCaptcha\"]'):\n            raise CaptchaError(\n                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')\n        if self.testing:\n            import tempfile\n            import uuid\n            with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',\n                                             suffix='.html', delete=False) as f:\n                f.write(raw)\n            print('Downloaded HTML for', asin, 'saved in', f.name)\n\n        try:\n            title = self.parse_title(root)\n        except:\n            self.log.exception('Error parsing title for url: %r' % self.url)\n            title = None\n\n        try:\n            authors = self.parse_authors(root)\n        except:\n            self.log.exception('Error parsing authors for url: %r' % self.url)\n            authors = []\n\n        if not title or not authors or not asin:\n            self.log.error(\n                'Could not find title/authors/asin for %r' % self.url)\n            self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,\n                                                               authors))\n            return\n\n        mi = Metadata(title, authors)\n        idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain\n        mi.set_identifier(idtype, asin)\n        self.amazon_id = asin\n\n        try:\n            mi.rating = self.parse_rating(root)\n        except:\n            self.log.exception('Error parsing ratings for url: %r' % self.url)\n\n        try:\n            mi.comments = self.parse_comments(root, raw)\n        except:\n            self.log.exception('Error parsing comments for url: %r' % self.url)\n\n        try:\n            series, series_index = self.parse_series(root)\n            if series:\n                mi.series, mi.series_index = series, series_index\n            elif self.testing:\n                mi.series, mi.series_index = 'Dummy series for testing', 1\n        except:\n            self.log.exception('Error parsing series for url: %r' % self.url)\n\n        try:\n            mi.tags = self.parse_tags(root)\n        except:\n            self.log.exception('Error parsing tags for url: %r' % self.url)\n\n        try:\n            self.cover_url = self.parse_cover(root, raw)\n        except:\n            self.log.exception('Error parsing cover for url: %r' % self.url)\n        if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):\n            self.cover_url = self.cover_url_processor(self.cover_url)\n        mi.has_cover = bool(self.cover_url)\n\n        detail_bullets = root.xpath('//*[@data-feature-name=\"detailBullets\"]')\n        non_hero = tuple(self.selector(\n            'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(\n                '#productDetails_techSpec_sections'))\n        feature_and_detail_bullets = root.xpath('//*[@data-feature-name=\"featureBulletsAndDetailBullets\"]')\n        if detail_bullets:\n            self.parse_detail_bullets(root, mi, detail_bullets[0])\n        elif non_hero:\n            try:\n                self.parse_new_details(root, mi, non_hero[0])\n            except:\n                self.log.exception(\n                    'Failed to parse new-style book details section')\n        elif feature_and_detail_bullets:\n            self.parse_detail_bullets(root, mi, feature_and_detail_bullets[0], ul_selector='ul')\n\n        else:\n            pd = root.xpath(self.pd_xpath)\n            if pd:\n                pd = pd[0]\n\n                try:\n                    isbn = self.parse_isbn(pd)\n                    if isbn:\n                        self.isbn = mi.isbn = isbn\n                except:\n                    self.log.exception(\n                        'Error parsing ISBN for url: %r' % self.url)\n\n                try:\n                    mi.publisher = self.parse_publisher(pd)\n                except:\n                    self.log.exception(\n                        'Error parsing publisher for url: %r' % self.url)\n\n                try:\n                    mi.pubdate = self.parse_pubdate(pd)\n                except:\n                    self.log.exception(\n                        'Error parsing publish date for url: %r' % self.url)\n\n                try:\n                    lang = self.parse_language(pd)\n                    if lang:\n                        mi.language = lang\n                except:\n                    self.log.exception(\n                        'Error parsing language for url: %r' % self.url)\n\n            else:\n                self.log.warning(\n                    'Failed to find product description for url: %r' % self.url)\n\n        mi.source_relevance = self.relevance\n\n        if self.amazon_id:\n            if self.isbn:\n                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)\n            if self.cover_url:\n                self.plugin.cache_identifier_to_cover_url(self.amazon_id,\n                                                          self.cover_url)\n\n        self.plugin.clean_downloaded_metadata(mi)\n\n        if self.filter_result(mi, self.log):\n            self.result_queue.put(mi)\n\n    def totext(self, elem, only_printable=False):\n        res = self.tostring(elem, encoding='unicode', method='text')\n        if only_printable:\n            try:\n                filtered_characters = [s for s in res if s.isprintable()]\n            except AttributeError:\n                filtered_characters = [s for s in res if s in string.printable]\n            res = ''.join(filtered_characters)\n        return res.strip()\n\n    def parse_title(self, root):\n\n        def sanitize_title(title):\n            ans = title.strip()\n            if not ans.startswith('['):\n                ans = re.sub(r'[(\\[].*[)\\]]', '', title).strip()\n            return ans\n\n        h1 = root.xpath('//h1[@id=\"title\"]')\n        if h1:\n            h1 = h1[0]\n            for child in h1.xpath('./*[contains(@class, \"a-color-secondary\")]'):\n                h1.remove(child)\n            return sanitize_title(self.totext(h1))\n        # audiobooks\n        elem = root.xpath('//*[@id=\"productTitle\"]')\n        if elem:\n            return sanitize_title(self.totext(elem[0]))\n        tdiv = root.xpath('//h1[contains(@class, \"parseasinTitle\")]')\n        if not tdiv:\n            span = root.xpath('//*[@id=\"ebooksTitle\"]')\n            if span:\n                return sanitize_title(self.totext(span[0]))\n            h1 = root.xpath('//h1[@data-feature-name=\"title\"]')\n            if h1:\n                return sanitize_title(self.totext(h1[0]))\n            raise ValueError('No title block found')\n        tdiv = tdiv[0]\n        actual_title = tdiv.xpath('descendant::*[@id=\"btAsinTitle\"]')\n        if actual_title:\n            title = self.tostring(actual_title[0], encoding='unicode',\n                                  method='text').strip()\n        else:\n            title = self.tostring(tdiv, encoding='unicode',\n                                  method='text').strip()\n        return sanitize_title(title)\n\n    def parse_authors(self, root):\n        for sel in (\n                '#byline .author .contributorNameID',\n                '#byline .author a.a-link-normal',\n                '#bylineInfo .author .contributorNameID',\n                '#bylineInfo .author a.a-link-normal',\n                '#bylineInfo #bylineContributor',\n                '#bylineInfo #contributorLink',\n        ):\n            matches = tuple(self.selector(sel))\n            if matches:\n                authors = [self.totext(x) for x in matches]\n                return [a for a in authors if a]\n\n        x = '//h1[contains(@class, \"parseasinTitle\")]/following-sibling::span/*[(name()=\"a\" and @href) or (name()=\"span\" and @class=\"contributorNameTrigger\")]'\n        aname = root.xpath(x)\n        if not aname:\n            aname = root.xpath('''\n            //h1[contains(@class, \"parseasinTitle\")]/following-sibling::*[(name()=\"a\" and @href) or (name()=\"span\" and @class=\"contributorNameTrigger\")]\n                    ''')\n        for x in aname:\n            x.tail = ''\n        authors = [self.tostring(x, encoding='unicode', method='text').strip() for x\n                   in aname]\n        authors = [a for a in authors if a]\n        return authors\n\n    def parse_rating(self, root):\n        for x in root.xpath('//div[@id=\"cpsims-feature\" or @id=\"purchase-sims-feature\" or @id=\"rhf\"]'):\n            # Remove the similar books section as it can cause spurious\n            # ratings matches\n            x.getparent().remove(x)\n\n        rating_paths = (\n            '//div[@data-feature-name=\"averageCustomerReviews\" or @id=\"averageCustomerReviews\"]',\n            '//div[@class=\"jumpBar\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]',\n            '//div[@class=\"buying\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]',\n            '//span[@class=\"crAvgStars\"]/descendant::span[contains(@class,\"asinReviewsSummary\")]'\n        )\n        ratings = None\n        for p in rating_paths:\n            ratings = root.xpath(p)\n            if ratings:\n                break\n\n        def parse_ratings_text(text):\n            try:\n                m = self.ratings_pat.match(text)\n                return float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5\n            except Exception:\n                pass\n\n        if ratings:\n            ratings = ratings[0]\n            for elem in ratings.xpath('descendant::*[@title]'):\n                t = elem.get('title').strip()\n                if self.domain == 'cn':\n                    m = self.ratings_pat_cn.match(t)\n                    if m is not None:\n                        return float(m.group(1))\n                elif self.domain == 'jp':\n                    m = self.ratings_pat_jp.match(t)\n                    if m is not None:\n                        return float(m.group(1))\n                else:\n                    ans = parse_ratings_text(t)\n                    if ans is not None:\n                        return ans\n            for elem in ratings.xpath('descendant::span[@class=\"a-icon-alt\"]'):\n                t = self.tostring(\n                    elem, encoding='unicode', method='text', with_tail=False).strip()\n                ans = parse_ratings_text(t)\n                if ans is not None:\n                    return ans\n        else:\n            # found in kindle book pages on amazon.com\n            for x in root.xpath('//a[@id=\"acrCustomerReviewLink\"]'):\n                spans = x.xpath('./span')\n                if spans:\n                    txt = self.tostring(spans[0], method='text', encoding='unicode', with_tail=False).strip()\n                    try:\n                        return float(txt.replace(',', '.'))\n                    except Exception:\n                        pass\n\n    def _render_comments(self, desc):\n        from calibre.library.comments import sanitize_comments_html\n\n        for c in desc.xpath('descendant::noscript'):\n            c.getparent().remove(c)\n        for c in desc.xpath('descendant::*[@class=\"seeAll\" or'\n                            ' @class=\"emptyClear\" or @id=\"collapsePS\" or'\n                            ' @id=\"expandPS\"]'):\n            c.getparent().remove(c)\n        for b in desc.xpath('descendant::b[@style]'):\n            # Bing highlights search results\n            s = b.get('style', '')\n            if 'color' in s:\n                b.tag = 'span'\n                del b.attrib['style']\n\n        for a in desc.xpath('descendant::a[@href]'):\n            del a.attrib['href']\n            a.tag = 'span'\n        for a in desc.xpath('descendant::span[@class=\"a-text-italic\"]'):\n            a.tag = 'i'\n        for a in desc.xpath('descendant::span[@class=\"a-text-bold\"]'):\n            a.tag = 'b'\n        desc = self.tostring(desc, method='html', encoding='unicode').strip()\n        desc = xml_replace_entities(desc, 'utf-8')\n\n        # Encoding bug in Amazon data U+fffd (replacement char)\n        # in some examples it is present in place of '\n        desc = desc.replace('\\ufffd', \"'\")\n        # remove all attributes from tags\n        desc = re.sub(r'<([a-zA-Z0-9]+)\\s[^>]+>', r'<\\1>', desc)\n        # Collapse whitespace\n        # desc = re.sub(r'\\n+', '\\n', desc)\n        # desc = re.sub(r' +', ' ', desc)\n        # Remove the notice about text referring to out of print editions\n        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)\n        # Remove comments\n        desc = re.sub(r'(?s)<!--.*?-->', '', desc)\n        return sanitize_comments_html(desc)\n\n    def parse_comments(self, root, raw):\n        try:\n            from urllib.parse import unquote\n        except ImportError:\n            from urllib import unquote\n        ans = ''\n        ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionOverviewContent_feature_div')) or tuple(\n            self.selector('#drengr_DesktopTabbedDescriptionOverviewContent_feature_div'))\n        if ovr:\n            ovr = ovr[0]\n            ovr.tag = 'div'\n            ans = self._render_comments(ovr)\n            ovr = tuple(self.selector('#drengr_MobileTabbedDescriptionEditorialsContent_feature_div')) or tuple(\n                self.selector('#drengr_DesktopTabbedDescriptionEditorialsContent_feature_div'))\n            if ovr:\n                ovr = ovr[0]\n                ovr.tag = 'div'\n                ans += self._render_comments(ovr)\n        else:\n            ns = tuple(self.selector('#bookDescription_feature_div noscript'))\n            if ns:\n                ns = ns[0]\n                if len(ns) == 0 and ns.text:\n                    import html5lib\n\n                    # html5lib parsed noscript as CDATA\n                    ns = html5lib.parseFragment(\n                        '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]\n                else:\n                    ns.tag = 'div'\n                ans = self._render_comments(ns)\n            else:\n                desc = root.xpath('//div[@id=\"ps-content\"]/div[@class=\"content\"]')\n                if desc:\n                    ans = self._render_comments(desc[0])\n                else:\n                    ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))\n                    if ns:\n                        ans = self._render_comments(ns[0])\n        # audiobooks\n        if not ans:\n            elem = root.xpath('//*[@id=\"audible_desktopTabbedDescriptionOverviewContent_feature_div\"]')\n            if elem:\n                ans = self._render_comments(elem[0])\n        desc = root.xpath(\n            '//div[@id=\"productDescription\"]/*[@class=\"content\"]')\n        if desc:\n            ans += self._render_comments(desc[0])\n        else:\n            # Idiot chickens from amazon strike again. This data is now stored\n            # in a JS variable inside a script tag URL encoded.\n            m = re.search(br'var\\s+iframeContent\\s*=\\s*\"([^\"]+)\"', raw)\n            if m is not None:\n                try:\n                    text = unquote(m.group(1)).decode('utf-8')\n                    nr = parse_html(text)\n                    desc = nr.xpath(\n                        '//div[@id=\"productDescription\"]/*[@class=\"content\"]')\n                    if desc:\n                        ans += self._render_comments(desc[0])\n                except Exception as e:\n                    self.log.warn(\n                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))\n            else:\n                desc = root.xpath('//div[@id=\"productDescription_fullView\"]')\n                if desc:\n                    ans += self._render_comments(desc[0])\n\n        return ans\n\n    def parse_series(self, root):\n        ans = (None, None)\n\n        # This is found on kindle pages for books on amazon.com\n        series = root.xpath('//*[@id=\"rpi-attribute-book_details-series\"]')\n        if series:\n            spans = series[0].xpath('descendant::span')\n            if spans:\n                texts = [self.tostring(x, encoding='unicode', method='text', with_tail=False).strip() for x in spans]\n                texts = list(filter(None, texts))\n                if len(texts) == 2:\n                    idxinfo, series = texts\n                    m = re.search(r'[0-9.]+', idxinfo.strip())\n                    if m is not None:\n                        ans = series, float(m.group())\n                        return ans\n\n        # This is found on the paperback/hardback pages for books on amazon.com\n        series = root.xpath('//div[@data-feature-name=\"seriesTitle\"]')\n        if series:\n            series = series[0]\n            spans = series.xpath('./span')\n            if spans:\n                raw = self.tostring(\n                    spans[0], encoding='unicode', method='text', with_tail=False).strip()\n                m = re.search(r'\\s+([0-9.]+)$', raw.strip())\n                if m is not None:\n                    series_index = float(m.group(1))\n                    s = series.xpath('./a[@id=\"series-page-link\"]')\n                    if s:\n                        series = self.tostring(\n                            s[0], encoding='unicode', method='text', with_tail=False).strip()\n                        if series:\n                            ans = (series, series_index)\n        else:\n            series = root.xpath('//div[@id=\"seriesBulletWidget_feature_div\"]')\n            if series:\n                a = series[0].xpath('descendant::a')\n                if a:\n                    raw = self.tostring(a[0], encoding='unicode', method='text', with_tail=False)\n                    if self.domain == 'jp':\n                        m = re.search(r'(?P<index>[0-9.]+)\\s*(?:巻|冊)\\s*\\(全\\s*([0-9.]+)\\s*(?:巻|冊)\\):\\s*(?P<series>.+)', raw.strip())\n                    else:\n                        m = re.search(r'(?:Book|Libro|Buch)\\s+(?P<index>[0-9.]+)\\s+(?:of|de|von)\\s+([0-9.]+)\\s*:\\s*(?P<series>.+)', raw.strip())\n                    if m is not None:\n                        ans = (m.group('series').strip(), float(m.group('index')))\n\n        # This is found on Kindle edition pages on amazon.com\n        if ans == (None, None):\n            for span in root.xpath('//div[@id=\"aboutEbooksSection\"]//li/span'):\n                text = (span.text or '').strip()\n                m = re.match(r'Book\\s+([0-9.]+)', text)\n                if m is not None:\n                    series_index = float(m.group(1))\n                    a = span.xpath('./a[@href]')\n                    if a:\n                        series = self.tostring(\n                            a[0], encoding='unicode', method='text', with_tail=False).strip()\n                        if series:\n                            ans = (series, series_index)\n        # This is found on newer Kindle edition pages on amazon.com\n        if ans == (None, None):\n            for b in root.xpath('//div[@id=\"reviewFeatureGroup\"]/span/b'):\n                text = (b.text or '').strip()\n                m = re.match(r'Book\\s+([0-9.]+)', text)\n                if m is not None:\n                    series_index = float(m.group(1))\n                    a = b.getparent().xpath('./a[@href]')\n                    if a:\n                        series = self.tostring(\n                            a[0], encoding='unicode', method='text', with_tail=False).partition('(')[0].strip()\n                        if series:\n                            ans = series, series_index\n\n        if ans == (None, None):\n            desc = root.xpath('//div[@id=\"ps-content\"]/div[@class=\"buying\"]')\n            if desc:\n                raw = self.tostring(desc[0], method='text', encoding='unicode')\n                raw = re.sub(r'\\s+', ' ', raw)\n                match = self.series_pat.search(raw)\n                if match is not None:\n                    s, i = match.group('series'), float(match.group('index'))\n                    if s:\n                        ans = (s, i)\n        if ans[0]:\n            ans = (re.sub(r'\\s+Series$', '', ans[0]).strip(), ans[1])\n            ans = (re.sub(r'\\(.+?\\s+Series\\)$', '', ans[0]).strip(), ans[1])\n        return ans\n\n    def parse_tags(self, root):\n        ans = []\n        exclude_tokens = {'kindle', 'a-z'}\n        exclude = {'special features', 'by authors',\n                   'authors & illustrators', 'books', 'new; used & rental textbooks'}\n        seen = set()\n        for li in root.xpath(self.tags_xpath):\n            for i, a in enumerate(li.iterdescendants('a')):\n                if i > 0:\n                    # we ignore the first category since it is almost always\n                    # too broad\n                    raw = (a.text or '').strip().replace(',', ';')\n                    lraw = icu_lower(raw)\n                    tokens = frozenset(lraw.split())\n                    if raw and lraw not in exclude and not tokens.intersection(exclude_tokens) and lraw not in seen:\n                        ans.append(raw)\n                        seen.add(lraw)\n        return ans\n\n    def parse_cover(self, root, raw=b''):\n        # Look for the image URL in javascript, using the first image in the\n        # image gallery as the cover\n        import json\n        imgpat = re.compile(r'\"hiRes\":\"(.+?)\",\"thumb\"')\n        for script in root.xpath('//script'):\n            m = imgpat.search(script.text or '')\n            if m is not None:\n                return m.group(1)\n        imgpat = re.compile(r''''imageGalleryData'\\s*:\\s*(\\[\\s*{.+])''')\n        for script in root.xpath('//script'):\n            m = imgpat.search(script.text or '')\n            if m is not None:\n                try:\n                    return json.loads(m.group(1))[0]['mainUrl']\n                except Exception:\n                    continue\n\n        def clean_img_src(src):\n            parts = src.split('/')\n            if len(parts) > 3:\n                bn = parts[-1]\n                sparts = bn.split('_')\n                if len(sparts) > 2:\n                    bn = re.sub(r'\\.\\.jpg$', '.jpg', (sparts[0] + sparts[-1]))\n                    return ('/'.join(parts[:-1])) + '/' + bn\n\n        imgpat2 = re.compile(r'var imageSrc = \"([^\"]+)\"')\n        for script in root.xpath('//script'):\n            m = imgpat2.search(script.text or '')\n            if m is not None:\n                src = m.group(1)\n                url = clean_img_src(src)\n                if url:\n                    return url\n\n        imgs = root.xpath(\n            '//img[(@id=\"prodImage\" or @id=\"original-main-image\" or @id=\"main-image\" or @id=\"main-image-nonjs\") and @src]')\n        if not imgs:\n            imgs = (\n                root.xpath('//div[@class=\"main-image-inner-wrapper\"]/img[@src]') or\n                root.xpath('//div[@id=\"main-image-container\" or @id=\"ebooks-main-image-container\"]//img[@src]') or\n                root.xpath(\n                    '//div[@id=\"mainImageContainer\"]//img[@data-a-dynamic-image]')\n            )\n            for img in imgs:\n                try:\n                    idata = json.loads(img.get('data-a-dynamic-image'))\n                except Exception:\n                    imgs = ()\n                else:\n                    mwidth = 0\n                    try:\n                        url = None\n                        for iurl, (width, height) in idata.items():\n                            if width > mwidth:\n                                mwidth = width\n                                url = iurl\n\n                        return url\n                    except Exception:\n                        pass\n\n        for img in imgs:\n            src = img.get('src')\n            if 'data:' in src:\n                continue\n            if 'loading-' in src:\n                js_img = re.search(br'\"largeImage\":\"(https?://[^\"]+)\",', raw)\n                if js_img:\n                    src = js_img.group(1).decode('utf-8')\n            if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):\n                self.log('Found image: %s' % src)\n                url = clean_img_src(src)\n                if url:\n                    return url\n\n    def parse_detail_bullets(self, root, mi, container, ul_selector='.detail-bullet-list'):\n        try:\n            ul = next(self.selector(ul_selector, root=container))\n        except StopIteration:\n            return\n        for span in self.selector('.a-list-item', root=ul):\n            cells = span.xpath('./span')\n            if len(cells) >= 2:\n                self.parse_detail_cells(mi, cells[0], cells[1])\n\n    def parse_new_details(self, root, mi, non_hero):\n        table = non_hero.xpath('descendant::table')[0]\n        for tr in table.xpath('descendant::tr'):\n            cells = tr.xpath('descendant::*[local-name()=\"td\" or local-name()=\"th\"]')\n            if len(cells) == 2:\n                self.parse_detail_cells(mi, cells[0], cells[1])\n\n    def parse_detail_cells(self, mi, c1, c2):\n        name = self.totext(c1, only_printable=True).strip().strip(':').strip()\n        val = self.totext(c2)\n        val = val.replace('\\u200e', '').replace('\\u200f', '')\n        if not val:\n            return\n        if name in self.language_names:\n            ans = self.lang_map.get(val)\n            if not ans:\n                ans = canonicalize_lang(val)\n            if ans:\n                mi.language = ans\n        elif name in self.publisher_names:\n            pub = val.partition(';')[0].partition('(')[0].strip()\n            if pub:\n                mi.publisher = pub\n            date = val.rpartition('(')[-1].replace(')', '').strip()\n            try:\n                from calibre.utils.date import parse_only_date\n                date = self.delocalize_datestr(date)\n                mi.pubdate = parse_only_date(date, assume_utc=True)\n            except:\n                self.log.exception('Failed to parse pubdate: %s' % val)\n        elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:\n            ans = check_isbn(val)\n            if ans:\n                self.isbn = mi.isbn = ans\n        elif name in {'Publication date'}:\n            from calibre.utils.date import parse_only_date\n            date = self.delocalize_datestr(val)\n            mi.pubdate = parse_only_date(date, assume_utc=True)\n\n    def parse_isbn(self, pd):\n        items = pd.xpath(\n            'descendant::*[starts-with(text(), \"ISBN\")]')\n        if not items:\n            items = pd.xpath(\n                'descendant::b[contains(text(), \"ISBN:\")]')\n        for x in reversed(items):\n            if x.tail:\n                ans = check_isbn(x.tail.strip())\n                if ans:\n                    return ans\n\n    def parse_publisher(self, pd):\n        for x in reversed(pd.xpath(self.publisher_xpath)):\n            if x.tail:\n                ans = x.tail.partition(';')[0]\n                return ans.partition('(')[0].strip()\n\n    def parse_pubdate(self, pd):\n        from calibre.utils.date import parse_only_date\n        for x in reversed(pd.xpath(self.pubdate_xpath)):\n            if x.tail:\n                date = x.tail.strip()\n                date = self.delocalize_datestr(date)\n                try:\n                    return parse_only_date(date, assume_utc=True)\n                except Exception:\n                    pass\n        for x in reversed(pd.xpath(self.publisher_xpath)):\n            if x.tail:\n                ans = x.tail\n                date = ans.rpartition('(')[-1].replace(')', '').strip()\n                date = self.delocalize_datestr(date)\n                try:\n                    return parse_only_date(date, assume_utc=True)\n                except Exception:\n                    pass\n\n    def parse_language(self, pd):\n        for x in reversed(pd.xpath(self.language_xpath)):\n            if x.tail:\n                raw = x.tail.strip().partition(',')[0].strip()\n                ans = self.lang_map.get(raw, None)\n                if ans:\n                    return ans\n                ans = canonicalize_lang(ans)\n                if ans:\n                    return ans\n# }}}\n\n\nclass Amazon(Source):\n\n    name = 'Amazon.com'\n    version = (1, 3, 13)\n    minimum_calibre_version = (2, 82, 0)\n    description = _('Downloads metadata and covers from Amazon')\n\n    capabilities = frozenset(('identify', 'cover'))\n    touched_fields = frozenset(('title', 'authors', 'identifier:amazon',\n        'rating', 'comments', 'publisher', 'pubdate',\n        'languages', 'series', 'tags'))\n    has_html_comments = True\n    supports_gzip_transfer_encoding = True\n    prefer_results_with_isbn = False\n\n    AMAZON_DOMAINS = {\n        'com': _('US'),\n        'fr': _('France'),\n        'de': _('Germany'),\n        'uk': _('UK'),\n        'au': _('Australia'),\n        'it': _('Italy'),\n        'jp': _('Japan'),\n        'es': _('Spain'),\n        'br': _('Brazil'),\n        'in': _('India'),\n        'nl': _('Netherlands'),\n        'cn': _('China'),\n        'ca': _('Canada'),\n        'se': _('Sweden'),\n    }\n\n    SERVERS = {\n        'auto': _('Choose server automatically'),\n        'amazon': _('Amazon servers'),\n        'bing': _('Bing search cache'),\n        'google': _('Google search cache'),\n        'wayback': _('Wayback machine cache (slow)'),\n        'ddg': _('DuckDuckGo search and Google cache'),\n    }\n\n    options = (\n        Option('domain', 'choices', 'com', _('Amazon country website to use:'),\n               _('Metadata from Amazon will be fetched using this '\n                 \"country's Amazon website.\"), choices=AMAZON_DOMAINS),\n        Option('server', 'choices', 'auto', _('Server to get data from:'),\n               _(\n                   'Amazon has started blocking attempts to download'\n                   ' metadata from its servers. To get around this problem,'\n                   ' calibre can fetch the Amazon data from many different'\n                   ' places where it is cached. Choose the source you prefer.'\n               ), choices=SERVERS),\n        Option('use_mobi_asin', 'bool', False, _('Use the MOBI-ASIN for metadata search'),\n               _(\n                   'Enable this option to search for metadata with an'\n                   ' ASIN identifier from the MOBI file at the current country website,'\n                   ' unless any other amazon id is available. Note that if the'\n                   ' MOBI file came from a different Amazon country store, you could get'\n                   ' incorrect results.'\n               )),\n        Option('prefer_kindle_edition', 'bool', False, _('Prefer the Kindle edition, when available'),\n               _(\n                   'When searching for a book and the search engine returns both paper and Kindle editions,'\n                   ' always prefer the Kindle edition, instead of whatever the search engine returns at the'\n                   ' top.')\n        ),\n    )\n\n    def __init__(self, *args, **kwargs):\n        Source.__init__(self, *args, **kwargs)\n        self.set_amazon_id_touched_fields()\n\n    def id_from_url(self, url):\n        from polyglot.urllib import urlparse\n        purl = urlparse(url)\n        if purl.netloc and purl.path and '/dp/' in purl.path:\n            host_parts = tuple(x.lower() for x in purl.netloc.split('.'))\n            if 'amazon' in host_parts:\n                domain = host_parts[-1]\n            parts = purl.path.split('/')\n            idx = parts.index('dp')\n            try:\n                val = parts[idx+1]\n            except IndexError:\n                return\n            aid = 'amazon' if domain == 'com' else ('amazon_' + domain)\n            return aid, val\n\n    def test_fields(self, mi):\n        '''\n        Return the first field from self.touched_fields that is null on the\n        mi object\n        '''\n        for key in self.touched_fields:\n            if key.startswith('identifier:'):\n                key = key.partition(':')[-1]\n                if key == 'amazon':\n                    if self.domain != 'com':\n                        key += '_' + self.domain\n                if not mi.has_identifier(key):\n                    return 'identifier: ' + key\n            elif mi.is_null(key):\n                return key\n\n    @property\n    def browser(self):\n        br = self._browser\n        if br is None:\n            ua = 'Mobile '\n            while not user_agent_is_ok(ua):\n                ua = random_user_agent(allow_ie=False)\n            # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n            self._browser = br = browser(user_agent=ua)\n            br.set_handle_gzip(True)\n            if self.use_search_engine:\n                br.addheaders += [\n                    ('Accept', accept_header_for_ua(ua)),\n                    ('Upgrade-insecure-requests', '1'),\n                ]\n            else:\n                br.addheaders += [\n                    ('Accept', accept_header_for_ua(ua)),\n                    ('Upgrade-insecure-requests', '1'),\n                    ('Referer', self.referrer_for_domain()),\n                ]\n        return br\n\n    def save_settings(self, *args, **kwargs):\n        Source.save_settings(self, *args, **kwargs)\n        self.set_amazon_id_touched_fields()\n\n    def set_amazon_id_touched_fields(self):\n        ident_name = 'identifier:amazon'\n        if self.domain != 'com':\n            ident_name += '_' + self.domain\n        tf = [x for x in self.touched_fields if not\n              x.startswith('identifier:amazon')] + [ident_name]\n        self.touched_fields = frozenset(tf)\n\n    def get_domain_and_asin(self, identifiers, extra_domains=()):\n        identifiers = {k.lower(): v for k, v in identifiers.items()}\n        for key, val in identifiers.items():\n            if key in ('amazon', 'asin'):\n                return 'com', val\n            if key.startswith('amazon_'):\n                domain = key.partition('_')[-1]\n                if domain and (domain in self.AMAZON_DOMAINS or domain in extra_domains):\n                    return domain, val\n        if self.prefs['use_mobi_asin']:\n            val = identifiers.get('mobi-asin')\n            if val is not None:\n                return self.domain, val\n        return None, None\n\n    def referrer_for_domain(self, domain=None):\n        domain = domain or self.domain\n        return {\n            'uk':  'https://www.amazon.co.uk/',\n            'au':  'https://www.amazon.com.au/',\n            'br':  'https://www.amazon.com.br/',\n            'jp':  'https://www.amazon.co.jp/',\n            'mx':  'https://www.amazon.com.mx/',\n        }.get(domain, 'https://www.amazon.%s/' % domain)\n\n    def _get_book_url(self, identifiers):  # {{{\n        domain, asin = self.get_domain_and_asin(\n            identifiers, extra_domains=('au', 'ca'))\n        if domain and asin:\n            url = None\n            r = self.referrer_for_domain(domain)\n            if r is not None:\n                url = r + 'dp/' + asin\n            if url:\n                idtype = 'amazon' if domain == 'com' else 'amazon_' + domain\n                return domain, idtype, asin, url\n\n    def get_book_url(self, identifiers):\n        ans = self._get_book_url(identifiers)\n        if ans is not None:\n            return ans[1:]\n\n    def get_book_url_name(self, idtype, idval, url):\n        if idtype == 'amazon':\n            return self.name\n        return 'A' + idtype.replace('_', '.')[1:]\n    # }}}\n\n    @property\n    def domain(self):\n        x = getattr(self, 'testing_domain', None)\n        if x is not None:\n            return x\n        domain = self.prefs['domain']\n        if domain not in self.AMAZON_DOMAINS:\n            domain = 'com'\n\n        return domain\n\n    @property\n    def server(self):\n        x = getattr(self, 'testing_server', None)\n        if x is not None:\n            return x\n        server = self.prefs['server']\n        if server not in self.SERVERS:\n            server = 'auto'\n        return server\n\n    @property\n    def use_search_engine(self):\n        return self.server != 'amazon'\n\n    def clean_downloaded_metadata(self, mi):\n        docase = (\n            mi.language == 'eng' or\n            (mi.is_null('language') and self.domain in {'com', 'uk', 'au'})\n        )\n        if mi.title and docase:\n            # Remove series information from title\n            m = re.search(r'\\S+\\s+(\\(.+?\\s+Book\\s+\\d+\\))$', mi.title)\n            if m is not None:\n                mi.title = mi.title.replace(m.group(1), '').strip()\n            mi.title = fixcase(mi.title)\n        mi.authors = fixauthors(mi.authors)\n        if mi.tags and docase:\n            mi.tags = list(map(fixcase, mi.tags))\n        mi.isbn = check_isbn(mi.isbn)\n        if mi.series and docase:\n            mi.series = fixcase(mi.series)\n        if mi.title and mi.series:\n            for pat in (r':\\s*Book\\s+\\d+\\s+of\\s+%s$', r'\\(%s\\)$', r':\\s*%s\\s+Book\\s+\\d+$'):\n                pat = pat % re.escape(mi.series)\n                q = re.sub(pat, '', mi.title, flags=re.I).strip()\n                if q and q != mi.title:\n                    mi.title = q\n                    break\n\n    def get_website_domain(self, domain):\n        return {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br', 'au': 'com.au'}.get(domain, domain)\n\n    def create_query(self, log, title=None, authors=None, identifiers={},  # {{{\n                     domain=None, for_amazon=True):\n        try:\n            from urllib.parse import unquote_plus, urlencode\n        except ImportError:\n            from urllib import unquote_plus, urlencode\n        if domain is None:\n            domain = self.domain\n\n        idomain, asin = self.get_domain_and_asin(identifiers)\n        if idomain is not None:\n            domain = idomain\n\n        # See the amazon detailed search page to get all options\n        terms = []\n        q = {'search-alias': 'aps',\n             'unfiltered': '1',\n        }\n\n        if domain == 'com':\n            q['sort'] = 'relevanceexprank'\n        else:\n            q['sort'] = 'relevancerank'\n\n        isbn = check_isbn(identifiers.get('isbn', None))\n\n        if asin is not None:\n            q['field-keywords'] = asin\n            terms.append(asin)\n        elif isbn is not None:\n            q['field-isbn'] = isbn\n            if len(isbn) == 13:\n                terms.extend('({} OR {}-{})'.format(isbn, isbn[:3], isbn[3:]).split())\n            else:\n                terms.append(isbn)\n        else:\n            # Only return book results\n            q['search-alias'] = {'br': 'digital-text',\n                                 'nl': 'aps'}.get(domain, 'stripbooks')\n            if title:\n                title_tokens = list(self.get_title_tokens(title))\n                if title_tokens:\n                    q['field-title'] = ' '.join(title_tokens)\n                    terms.extend(title_tokens)\n            if authors:\n                author_tokens = list(self.get_author_tokens(authors,\n                                                       only_first_author=True))\n                if author_tokens:\n                    q['field-author'] = ' '.join(author_tokens)\n                    terms.extend(author_tokens)\n\n        if not ('field-keywords' in q or 'field-isbn' in q or\n                ('field-title' in q)):\n            # Insufficient metadata to make an identify query\n            log.error('Insufficient metadata to construct query, none of title, ISBN or ASIN supplied')\n            raise SearchFailed()\n\n        if not for_amazon:\n            return terms, domain\n\n        if domain == 'nl':\n            q['__mk_nl_NL'] = 'ÅMÅŽÕÑ'\n            if 'field-keywords' not in q:\n                q['field-keywords'] = ''\n            for f in 'field-isbn field-title field-author'.split():\n                q['field-keywords'] += ' ' + q.pop(f, '')\n            q['field-keywords'] = q['field-keywords'].strip()\n\n        encoded_q = {x.encode('utf-8', 'ignore'): y.encode('utf-8', 'ignore') for x, y in q.items()}\n        url_query = urlencode(encoded_q)\n        # amazon's servers want IRIs with unicode characters not percent esaped\n        parts = []\n        for x in url_query.split(b'&' if isinstance(url_query, bytes) else '&'):\n            k, v = x.split(b'=' if isinstance(x, bytes) else '=', 1)\n            parts.append('{}={}'.format(iri_quote_plus(unquote_plus(k)), iri_quote_plus(unquote_plus(v))))\n        url_query = '&'.join(parts)\n        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(\n            domain) + url_query\n        return url, domain\n\n    # }}}\n\n    def get_cached_cover_url(self, identifiers):  # {{{\n        url = None\n        domain, asin = self.get_domain_and_asin(identifiers)\n        if asin is None:\n            isbn = identifiers.get('isbn', None)\n            if isbn is not None:\n                asin = self.cached_isbn_to_identifier(isbn)\n        if asin is not None:\n            url = self.cached_identifier_to_cover_url(asin)\n\n        return url\n    # }}}\n\n    def parse_results_page(self, root, domain):  # {{{\n        from lxml.html import tostring\n\n        matches = []\n\n        def title_ok(title):\n            title = title.lower()\n            bad = ['bulk pack', '[audiobook]', '[audio cd]',\n                   '(a book companion)', '( slipcase with door )', ': free sampler']\n            if self.domain == 'com':\n                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])\n            for x in bad:\n                if x in title:\n                    return False\n            if title and title[0] in '[{' and re.search(r'\\(\\s*author\\s*\\)', title) is not None:\n                # Bad entries in the catalog\n                return False\n            return True\n\n        for query in (\n            '//div[contains(@class, \"s-result-list\")]//h2/a[@href]',\n            '//div[contains(@class, \"s-result-list\")]//div[@data-index]//h5//a[@href]',\n            r'//li[starts-with(@id, \"result_\")]//a[@href and contains(@class, \"s-access-detail-page\")]',\n            '//div[@data-cy=\"title-recipe\"]/a[@href]',\n        ):\n            result_links = root.xpath(query)\n            if result_links:\n                break\n        for a in result_links:\n            title = tostring(a, method='text', encoding='unicode')\n            if title_ok(title):\n                url = a.get('href')\n                if url.startswith('/'):\n                    url = 'https://www.amazon.%s%s' % (\n                        self.get_website_domain(domain), url)\n                matches.append(url)\n\n        if not matches:\n            # Previous generation of results page markup\n            for div in root.xpath(r'//div[starts-with(@id, \"result_\")]'):\n                links = div.xpath(r'descendant::a[@class=\"title\" and @href]')\n                if not links:\n                    # New amazon markup\n                    links = div.xpath('descendant::h3/a[@href]')\n                for a in links:\n                    title = tostring(a, method='text', encoding='unicode')\n                    if title_ok(title):\n                        url = a.get('href')\n                        if url.startswith('/'):\n                            url = 'https://www.amazon.%s%s' % (\n                                self.get_website_domain(domain), url)\n                        matches.append(url)\n                    break\n\n        if not matches:\n            # This can happen for some user agents that Amazon thinks are\n            # mobile/less capable\n            for td in root.xpath(\n                    r'//div[@id=\"Results\"]/descendant::td[starts-with(@id, \"search:Td:\")]'):\n                for a in td.xpath(r'descendant::td[@class=\"dataColumn\"]/descendant::a[@href]/span[@class=\"srTitle\"]/..'):\n                    title = tostring(a, method='text', encoding='unicode')\n                    if title_ok(title):\n                        url = a.get('href')\n                        if url.startswith('/'):\n                            url = 'https://www.amazon.%s%s' % (\n                                self.get_website_domain(domain), url)\n                        matches.append(url)\n                    break\n        if not matches and root.xpath('//form[@action=\"/errors/validateCaptcha\"]'):\n            raise CaptchaError('Amazon returned a CAPTCHA page. Recently Amazon has begun using statistical'\n                               ' profiling to block access to its website. As such this metadata plugin is'\n                               ' unlikely to ever work reliably.')\n\n        # Keep only the top 3 matches as the matches are sorted by relevance by\n        # Amazon so lower matches are not likely to be very relevant\n        return matches[:3]\n    # }}}\n\n    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{\n        from calibre.ebooks.chardet import xml_to_unicode\n        from calibre.utils.cleantext import clean_ascii_chars\n        matches = []\n        query, domain = self.create_query(log, title=title, authors=authors,\n                                          identifiers=identifiers)\n        time.sleep(1)\n        try:\n            raw = br.open_novisit(query, timeout=timeout).read().strip()\n        except Exception as e:\n            if callable(getattr(e, 'getcode', None)) and \\\n                    e.getcode() == 404:\n                log.error('Query malformed: %r' % query)\n                raise SearchFailed()\n            attr = getattr(e, 'args', [None])\n            attr = attr if attr else [None]\n            if isinstance(attr[0], socket.timeout):\n                msg = _('Amazon timed out. Try again later.')\n                log.error(msg)\n            else:\n                msg = 'Failed to make identify query: %r' % query\n                log.exception(msg)\n            raise SearchFailed()\n\n        raw = clean_ascii_chars(xml_to_unicode(raw,\n                                               strip_encoding_pats=True, resolve_entities=True)[0])\n\n        if testing:\n            import tempfile\n            with tempfile.NamedTemporaryFile(prefix='amazon_results_',\n                                             suffix='.html', delete=False) as f:\n                f.write(raw.encode('utf-8'))\n            print('Downloaded html for results page saved in', f.name)\n\n        matches = []\n        found = '<title>404 - ' not in raw\n\n        if found:\n            try:\n                root = parse_html(raw)\n            except Exception:\n                msg = 'Failed to parse amazon page for query: %r' % query\n                log.exception(msg)\n                raise SearchFailed()\n\n        matches = self.parse_results_page(root, domain)\n\n        return matches, query, domain, None\n    # }}}\n\n    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{\n        from calibre.ebooks.metadata.sources.update import search_engines_module\n        se = search_engines_module()\n        terms, domain = self.create_query(log, title=title, authors=authors,\n                                          identifiers=identifiers, for_amazon=False)\n        site = self.referrer_for_domain(\n            domain)[len('https://'):].partition('/')[0]\n        matches = []\n        server = override_server or self.server\n        if server == 'bing':\n            urlproc, sfunc = se.bing_url_processor, se.bing_search\n        elif server == 'wayback':\n            urlproc, sfunc = se.wayback_url_processor, se.ddg_search\n        elif server == 'ddg':\n            urlproc, sfunc = se.ddg_url_processor, se.ddg_search\n        elif server == 'google':\n            urlproc, sfunc = se.google_url_processor, se.google_search\n        else:  # auto or unknown\n            urlproc, sfunc = se.google_url_processor, se.google_search\n            # urlproc, sfunc = se.bing_url_processor, se.bing_search\n        try:\n            results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)\n        except HTTPError as err:\n            if err.code == 429 and sfunc is se.google_search:\n                log('Got too many requests error from Google, trying via DuckDuckGo')\n                urlproc, sfunc = se.ddg_url_processor, se.ddg_search\n                results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)\n            else:\n                raise\n\n        br.set_current_header('Referer', qurl)\n        for result in results:\n            if abort.is_set():\n                return matches, terms, domain, None\n\n            purl = urlparse(result.url)\n            if '/dp/' in purl.path and site in purl.netloc:\n                # We cannot use cached URL as wayback machine no longer caches\n                # amazon and Google and Bing web caches are no longer\n                # accessible.\n                url = result.url\n                if url not in matches:\n                    matches.append(url)\n                if len(matches) >= 3:\n                    break\n            else:\n                log('Skipping non-book result:', result)\n        if not matches:\n            log('No search engine results for terms:', ' '.join(terms))\n            if urlproc is se.google_url_processor:\n                # Google does not cache adult titles\n                log('Trying the bing search engine instead')\n                return self.search_search_engine(br, testing, log, abort, title, authors, identifiers, timeout, 'bing')\n        return matches, terms, domain, urlproc\n    # }}}\n\n    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{\n                 identifiers={}, timeout=60):\n        '''\n        Note this method will retry without identifiers automatically if no\n        match is found with identifiers.\n        '''\n\n        testing = getattr(self, 'running_a_test', False)\n\n        udata = self._get_book_url(identifiers)\n        br = self.browser\n        log('User-agent:', br.current_user_agent())\n        log('Server:', self.server)\n        if testing:\n            print('User-agent:', br.current_user_agent())\n        if udata is not None and not self.use_search_engine:\n            # Try to directly get details page instead of running a search\n            # Cannot use search engine as the directly constructed URL is\n            # usually redirected to a full URL by amazon, and is therefore\n            # not cached\n            domain, idtype, asin, durl = udata\n            if durl is not None:\n                preparsed_root = parse_details_page(\n                    durl, log, timeout, br, domain)\n                if preparsed_root is not None:\n                    qasin = parse_asin(preparsed_root[1], log, durl)\n                    if qasin == asin:\n                        w = Worker(durl, result_queue, br, log, 0, domain,\n                                   self, testing=testing, preparsed_root=preparsed_root, timeout=timeout)\n                        try:\n                            w.get_details()\n                            return\n                        except Exception:\n                            log.exception(\n                                'get_details failed for url: %r' % durl)\n        func = self.search_search_engine if self.use_search_engine else self.search_amazon\n        try:\n            matches, query, domain, cover_url_processor = func(\n                br, testing, log, abort, title, authors, identifiers, timeout)\n        except SearchFailed:\n            return\n\n        if abort.is_set():\n            return\n\n        if not matches:\n            if identifiers and title and authors:\n                log('No matches found with identifiers, retrying using only'\n                    ' title and authors. Query: %r' % query)\n                time.sleep(1)\n                return self.identify(log, result_queue, abort, title=title,\n                                     authors=authors, timeout=timeout)\n            log.error('No matches found with query: %r' % query)\n            return\n\n        if self.prefs['prefer_kindle_edition']:\n            matches = sort_matches_preferring_kindle_editions(matches)\n\n        workers = [Worker(\n            url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,\n            cover_url_processor=cover_url_processor, filter_result=partial(\n                self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]\n\n        for w in workers:\n            # Don't send all requests at the same time\n            time.sleep(1)\n            w.start()\n            if abort.is_set():\n                return\n\n        while not abort.is_set():\n            a_worker_is_alive = False\n            for w in workers:\n                w.join(0.2)\n                if abort.is_set():\n                    break\n                if w.is_alive():\n                    a_worker_is_alive = True\n            if not a_worker_is_alive:\n                break\n\n        return None\n    # }}}\n\n    def filter_result(self, title, authors, identifiers, mi, log):  # {{{\n        if not self.use_search_engine:\n            return True\n        if title is not None:\n            import regex\n            only_punctuation_pat = regex.compile(r'^\\p{P}+$')\n\n            def tokenize_title(x):\n                ans = icu_lower(x).replace(\"'\", '').replace('\"', '').rstrip(':')\n                if only_punctuation_pat.match(ans) is not None:\n                    ans = ''\n                return ans\n\n            tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}\n            tokens.discard('')\n            if tokens:\n                result_tokens = {tokenize_title(x) for x in mi.title.split()}\n                result_tokens.discard('')\n                if not tokens.intersection(result_tokens):\n                    log('Ignoring result:', mi.title, 'as its title does not match')\n                    return False\n        if authors:\n            author_tokens = set()\n            for author in authors:\n                author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}\n            result_tokens = set()\n            for author in mi.authors:\n                result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}\n            if author_tokens and not author_tokens.intersection(result_tokens):\n                log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')\n                return False\n        return True\n    # }}}\n\n    def download_cover(self, log, result_queue, abort,  # {{{\n                       title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):\n        cached_url = self.get_cached_cover_url(identifiers)\n        if cached_url is None:\n            log.info('No cached cover found, running identify')\n            rq = Queue()\n            self.identify(log, rq, abort, title=title, authors=authors,\n                          identifiers=identifiers)\n            if abort.is_set():\n                return\n            results = []\n            while True:\n                try:\n                    results.append(rq.get_nowait())\n                except Empty:\n                    break\n            results.sort(key=self.identify_results_keygen(\n                title=title, authors=authors, identifiers=identifiers))\n            for mi in results:\n                cached_url = self.get_cached_cover_url(mi.identifiers)\n                if cached_url is not None:\n                    break\n        if cached_url is None:\n            log.info('No cover found')\n            return\n\n        if abort.is_set():\n            return\n        log('Downloading cover from:', cached_url)\n        br = self.browser\n        if self.use_search_engine:\n            br = br.clone_browser()\n            br.set_current_header('Referer', self.referrer_for_domain(self.domain))\n        try:\n            time.sleep(1)\n            cdata = br.open_novisit(\n                cached_url, timeout=timeout).read()\n            result_queue.put((self, cdata))\n        except:\n            log.exception('Failed to download cover from:', cached_url)\n    # }}}\n\n\ndef manual_tests(domain, **kw):  # {{{\n    # To run these test use:\n    # calibre-debug -c \"from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')\"\n    from calibre.ebooks.metadata.sources.test import authors_test, comments_test, isbn_test, series_test, test_identify_plugin, title_test\n    all_tests = {}\n    all_tests['com'] = [  # {{{\n        (  # in title\n            {'title': 'Expert C# 2008 Business Objects',\n             'authors': ['Lhotka']},\n            [title_test('Expert C#'),\n             authors_test(['Rockford Lhotka'])\n             ]\n        ),\n\n        (   # Paperback with series\n            {'identifiers': {'amazon': '1423146786'}},\n            [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]\n        ),\n\n        (   # Kindle edition with series\n            {'identifiers': {'amazon': 'B0085UEQDO'}},\n            [title_test('Three Parts Dead', exact=True),\n             series_test('Craft Sequence', 1)]\n        ),\n\n        (  # + in title and uses id=\"main-image\" for cover\n            {'identifiers': {'amazon': '1933988770'}},\n            [title_test(\n                'C++ Concurrency in Action: Practical Multithreading', exact=True)]\n        ),\n\n\n        (  # Different comments markup, using Book Description section\n            {'identifiers': {'amazon': '0982514506'}},\n            [title_test(\n                \"Griffin's Destiny\",\n                exact=True),\n             comments_test('Jelena'), comments_test('Ashinji'),\n             ]\n        ),\n\n        (   # New search results page markup (Dec 2024)\n            {'title': 'Come si scrive un articolo medico-scientifico'},\n            [title_test('Come si scrive un articolo medico-scientifico', exact=True)]\n        ),\n\n        (  # No specific problems\n            {'identifiers': {'isbn': '0743273567'}},\n            [title_test('the great gatsby'),\n             authors_test(['f. Scott Fitzgerald'])]\n        ),\n\n    ]\n\n    # }}}\n\n    all_tests['de'] = [  # {{{\n        # series\n        (\n            {'identifiers': {'isbn': '3499275120'}},\n            [title_test('Vespasian: Das Schwert des Tribuns: Historischer Roman',\n                        exact=False), authors_test(['Robert Fabbri']), series_test('Die Vespasian-Reihe', 1)\n             ]\n\n        ),\n\n        (  # umlaut in title/authors\n            {'title': 'Flüsternde Wälder',\n             'authors': ['Nicola Förg']},\n            [title_test('Flüsternde Wälder'),\n             authors_test(['Nicola Förg'], subset=True)\n             ]\n        ),\n\n        (\n            {'identifiers': {'isbn': '9783453314979'}},\n            [title_test('Die letzten Wächter: Roman',\n                        exact=False), authors_test(['Sergej Lukianenko'])\n             ]\n\n        ),\n\n        (\n            {'identifiers': {'isbn': '3548283519'}},\n            [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',\n                        exact=False), authors_test(['Nele Neuhaus'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['it'] = [  # {{{\n        (\n            {'identifiers': {'isbn': '8838922195'}},\n            [title_test('La briscola in cinque',\n                        exact=True), authors_test(['Marco Malvaldi'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['fr'] = [  # {{{\n        (\n            {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},\n            [title_test('Le secret de Lola', exact=True),\n                authors_test(['Amélie BRIZIO'])\n            ]\n        ),\n        (\n            {'identifiers': {'isbn': '2221116798'}},\n            [title_test(\"L'étrange voyage de Monsieur Daldry\",\n                        exact=True), authors_test(['Marc Levy'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['es'] = [  # {{{\n        (\n            {'identifiers': {'isbn': '8483460831'}},\n            [title_test('Tiempos Interesantes',\n                        exact=False), authors_test(['Terry Pratchett'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['se'] = [  # {{{\n        (\n            {'identifiers': {'isbn': '9780552140287'}},\n            [title_test('Men At Arms: A Discworld Novel: 14',\n                        exact=False), authors_test(['Terry Pratchett'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['jp'] = [  # {{{\n        (  # Adult filtering test\n            {'identifiers': {'isbn': '4799500066'}},\n            [title_test('Bitch Trap'), ]\n        ),\n\n        (  # isbn -> title, authors\n            {'identifiers': {'isbn': '9784101302720'}},\n            [title_test('精霊の守り人',\n                        exact=True), authors_test(['上橋 菜穂子'])\n             ]\n        ),\n        (  # title, authors -> isbn (will use Shift_JIS encoding in query.)\n            {'title': '考えない練習',\n             'authors': ['小池 龍之介']},\n            [isbn_test('9784093881067'), ]\n        ),\n    ]  # }}}\n\n    all_tests['br'] = [  # {{{\n        (\n            {'title': 'A Ascensão da Sombra'},\n            [title_test('A Ascensão da Sombra'), authors_test(['Robert Jordan'])]\n        ),\n\n        (\n            {'title': 'Guerra dos Tronos'},\n            [title_test('A Guerra dos Tronos. As Crônicas de Gelo e Fogo - Livro 1'), authors_test(['George R. R. Martin'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['nl'] = [  # {{{\n        (\n            {'title': 'Freakonomics'},\n            [title_test('Freakonomics',\n                        exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['cn'] = [  # {{{\n        (\n            {'identifiers': {'isbn': '9787115369512'}},\n            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),\n             authors_test(['[美]sam Williams', '邓楠,李凡希'])]\n        ),\n        (\n            {'title': '爱上Raspberry Pi'},\n            [title_test('爱上Raspberry Pi',\n                        exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])\n             ]\n\n        ),\n    ]  # }}}\n\n    all_tests['ca'] = [  # {{{\n        (   # Paperback with series\n            {'identifiers': {'isbn': '9781623808747'}},\n            [title_test('Parting Shot', exact=True),\n             authors_test(['Mary Calmes'])]\n        ),\n        (  # in title\n            {'title': 'Expert C# 2008 Business Objects',\n             'authors': ['Lhotka']},\n            [title_test('Expert C# 2008 Business Objects'),\n             authors_test(['Rockford Lhotka'])]\n        ),\n        (  # noscript description\n            {'identifiers': {'amazon_ca': '162380874X'}},\n            [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])\n             ]\n        ),\n    ]  # }}}\n\n    all_tests['in'] = [  # {{{\n        (   # Paperback with series\n            {'identifiers': {'amazon_in': '1423146786'}},\n            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True)]\n        ),\n    ]  # }}}\n\n    def do_test(domain, start=0, stop=None, server='auto'):\n        tests = all_tests[domain]\n        if stop is None:\n            stop = len(tests)\n        tests = tests[start:stop]\n        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (\n            setattr(p, 'testing_domain', domain),\n            setattr(p, 'touched_fields', p.touched_fields - {'tags'}),\n            setattr(p, 'testing_server', server),\n        ))\n\n    do_test(domain, **kw)\n# }}}\n",
  "big_book_search": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__   = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Option, Source\n\n\ndef get_urls(br, tokens):\n    from urllib.parse import quote_plus\n\n    from html5_parser import parse\n    escaped = (quote_plus(x) for x in tokens if x and x.strip())\n    q = '+'.join(escaped)\n    url = 'https://bigbooksearch.com/please-dont-scrape-my-site-you-will-put-my-api-key-over-the-usage-limit-and-the-site-will-break/books/'+q\n    raw = br.open(url).read()\n    root = parse(raw.decode('utf-8'))\n    urls = [i.get('src') for i in root.xpath('//img[@src]')]\n    return urls\n\n\nclass BigBookSearch(Source):\n\n    name = 'Big Book Search'\n    version = (1, 0, 1)\n    minimum_calibre_version = (2, 80, 0)\n    description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')\n    capabilities = frozenset(['cover'])\n    can_get_multiple_covers = True\n    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n                      _('The maximum number of covers to process from the search result')),\n    )\n    supports_gzip_transfer_encoding = True\n\n    def download_cover(self, log, result_queue, abort,\n            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n        if not title:\n            return\n        br = self.browser\n        tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))\n        urls = get_urls(br, tokens)\n        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n\ndef test():\n    import pprint\n\n    from calibre import browser\n    br = browser()\n    urls = get_urls(br, ['consider', 'phlebas', 'banks'])\n    pprint.pprint(urls)\n\n\nif __name__ == '__main__':\n    test()\n",
  "edelweiss": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__   = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nimport re\nimport time\nfrom threading import Thread\n\ntry:\n    from queue import Empty, Queue\nexcept ImportError:\n    from Queue import Empty, Queue\n\nfrom calibre import as_unicode, random_user_agent\nfrom calibre.ebooks.metadata import check_isbn\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\ndef clean_html(raw):\n    from calibre.ebooks.chardet import xml_to_unicode\n    from calibre.utils.cleantext import clean_ascii_chars\n    return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,\n                                resolve_entities=True, assume_utf8=True)[0])\n\n\ndef parse_html(raw):\n    raw = clean_html(raw)\n    from html5_parser import parse\n    return parse(raw)\n\n\ndef astext(node):\n    from lxml import etree\n    return etree.tostring(node, method='text', encoding='unicode',\n                          with_tail=False).strip()\n\n\nclass Worker(Thread):  # {{{\n\n    def __init__(self, basic_data, relevance, result_queue, br, timeout, log, plugin):\n        Thread.__init__(self)\n        self.daemon = True\n        self.basic_data = basic_data\n        self.br, self.log, self.timeout = br, log, timeout\n        self.result_queue, self.plugin, self.sku = result_queue, plugin, self.basic_data['sku']\n        self.relevance = relevance\n\n    def run(self):\n        url = ('https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/product/two_Enhanced.ascx&'\n        'sku={0}&idPrefix=content_1_{0}&mode=0'.format(self.sku))\n        try:\n            raw = self.br.open_novisit(url, timeout=self.timeout).read()\n        except:\n            self.log.exception('Failed to load comments page: %r'%url)\n            return\n\n        try:\n            mi = self.parse(raw)\n            mi.source_relevance = self.relevance\n            self.plugin.clean_downloaded_metadata(mi)\n            self.result_queue.put(mi)\n        except:\n            self.log.exception('Failed to parse details for sku: %s'%self.sku)\n\n    def parse(self, raw):\n        from calibre.ebooks.metadata.book.base import Metadata\n        from calibre.utils.date import UNDEFINED_DATE\n        root = parse_html(raw)\n        mi = Metadata(self.basic_data['title'], self.basic_data['authors'])\n\n        # Identifiers\n        if self.basic_data['isbns']:\n            mi.isbn = self.basic_data['isbns'][0]\n        mi.set_identifier('edelweiss', self.sku)\n\n        # Tags\n        if self.basic_data['tags']:\n            mi.tags = self.basic_data['tags']\n            mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]\n\n        # Publisher\n        mi.publisher = self.basic_data['publisher']\n\n        # Pubdate\n        if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:\n            mi.pubdate = self.basic_data['pubdate']\n\n        # Rating\n        if self.basic_data['rating']:\n            mi.rating = self.basic_data['rating']\n\n        # Comments\n        comments = ''\n        for cid in ('summary', 'contributorbio', 'quotes_reviews'):\n            cid = 'desc_{}{}-content'.format(cid, self.sku)\n            div = root.xpath('//*[@id=\"{}\"]'.format(cid))\n            if div:\n                comments += self.render_comments(div[0])\n        if comments:\n            mi.comments = comments\n\n        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None\n        return mi\n\n    def render_comments(self, desc):\n        from lxml import etree\n\n        from calibre.library.comments import sanitize_comments_html\n        for c in desc.xpath('descendant::noscript'):\n            c.getparent().remove(c)\n        for a in desc.xpath('descendant::a[@href]'):\n            del a.attrib['href']\n            a.tag = 'span'\n        desc = etree.tostring(desc, method='html', encoding='unicode').strip()\n\n        # remove all attributes from tags\n        desc = re.sub(r'<([a-zA-Z0-9]+)\\s[^>]+>', r'<\\1>', desc)\n        # Collapse whitespace\n        # desc = re.sub(r'\\n+', '\\n', desc)\n        # desc = re.sub(r' +', ' ', desc)\n        # Remove comments\n        desc = re.sub(r'(?s)<!--.*?-->', '', desc)\n        return sanitize_comments_html(desc)\n# }}}\n\n\ndef get_basic_data(browser, log, *skus):\n    from mechanize import Request\n\n    from calibre.utils.date import parse_only_date\n    zeroes = ','.join('0' for sku in skus)\n    data = {\n            'skus': ','.join(skus),\n            'drc': zeroes,\n            'startPosition': '0',\n            'sequence': '1',\n            'selected': zeroes,\n            'itemID': '0',\n            'orderID': '0',\n            'mailingID': '',\n            'tContentWidth': '926',\n            'originalOrder': ','.join(type('')(i) for i in range(len(skus))),\n            'selectedOrderID': '0',\n            'selectedSortColumn': '0',\n            'listType': '1',\n            'resultType': '32',\n            'blockView': '1',\n    }\n    items_data_url = 'https://www.edelweiss.plus/GetTreelineControl.aspx?controlName=/uc/listviews/ListView_Title_Multi.ascx'\n    req = Request(items_data_url, data)\n    response = browser.open_novisit(req)\n    raw = response.read()\n    root = parse_html(raw)\n    for item in root.xpath('//div[@data-priority]'):\n        row = item.getparent().getparent()\n        sku = item.get('id').split('-')[-1]\n        isbns = [x.strip() for x in row.xpath('descendant::*[contains(@class, \"pev_sku\")]/text()')[0].split(',') if check_isbn(x.strip())]\n        isbns.sort(key=len, reverse=True)\n        try:\n            tags = [x.strip() for x in astext(row.xpath('descendant::*[contains(@class, \"pev_categories\")]')[0]).split('/')]\n        except IndexError:\n            tags = []\n        rating = 0\n        for bar in row.xpath('descendant::*[contains(@class, \"bgdColorCommunity\")]/@style'):\n            m = re.search(r'width: (\\d+)px;.*max-width: (\\d+)px', bar)\n            if m is not None:\n                rating = float(m.group(1)) / float(m.group(2))\n                break\n        try:\n            pubdate = parse_only_date(astext(row.xpath('descendant::*[contains(@class, \"pev_shipDate\")]')[0]\n                ).split(':')[-1].split(u'\\xa0')[-1].strip(), assume_utc=True)\n        except Exception:\n            log.exception('Error parsing published date')\n            pubdate = None\n        authors = []\n        for x in [x.strip() for x in row.xpath('descendant::*[contains(@class, \"pev_contributor\")]/@title')]:\n            authors.extend(a.strip() for a in x.split(','))\n        entry = {\n                'sku': sku,\n                'cover': row.xpath('descendant::img/@src')[0].split('?')[0],\n                'publisher': astext(row.xpath('descendant::*[contains(@class, \"headerPublisher\")]')[0]),\n                'title': astext(row.xpath('descendant::*[@id=\"title_{}\"]'.format(sku))[0]),\n                'authors': authors,\n                'isbns': isbns,\n                'tags': tags,\n                'pubdate': pubdate,\n                'format': ' '.join(row.xpath('descendant::*[contains(@class, \"pev_format\")]/text()')).strip(),\n                'rating': rating,\n        }\n        if entry['cover'].startswith('/'):\n            entry['cover'] = None\n        yield entry\n\n\nclass Edelweiss(Source):\n\n    name = 'Edelweiss'\n    version = (2, 0, 1)\n    minimum_calibre_version = (3, 6, 0)\n    description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')\n\n    capabilities = frozenset(['identify', 'cover'])\n    touched_fields = frozenset([\n        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n        'identifier:isbn', 'identifier:edelweiss', 'rating'])\n    supports_gzip_transfer_encoding = True\n    has_html_comments = True\n\n    @property\n    def user_agent(self):\n        # Pass in an index to random_user_agent() to test with a particular\n        # user agent\n        return random_user_agent(allow_ie=False)\n\n    def _get_book_url(self, sku):\n        if sku:\n            return 'https://www.edelweiss.plus/#sku={}&page=1'.format(sku)\n\n    def get_book_url(self, identifiers):  # {{{\n        sku = identifiers.get('edelweiss', None)\n        if sku:\n            return 'edelweiss', sku, self._get_book_url(sku)\n\n    # }}}\n\n    def get_cached_cover_url(self, identifiers):  # {{{\n        sku = identifiers.get('edelweiss', None)\n        if not sku:\n            isbn = identifiers.get('isbn', None)\n            if isbn is not None:\n                sku = self.cached_isbn_to_identifier(isbn)\n        return self.cached_identifier_to_cover_url(sku)\n    # }}}\n\n    def create_query(self, log, title=None, authors=None, identifiers={}):\n        try:\n            from urllib.parse import urlencode\n        except ImportError:\n            from urllib import urlencode\n        import time\n        BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'\n        'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')\n        keywords = []\n        isbn = check_isbn(identifiers.get('isbn', None))\n        if isbn is not None:\n            keywords.append(isbn)\n        elif title:\n            title_tokens = list(self.get_title_tokens(title))\n            if title_tokens:\n                keywords.extend(title_tokens)\n            author_tokens = self.get_author_tokens(authors, only_first_author=True)\n            if author_tokens:\n                keywords.extend(author_tokens)\n        if not keywords:\n            return None\n        params = {\n            'q': (' '.join(keywords)).encode('utf-8'),\n            '_': type('')(int(time.time()))\n        }\n        return BASE_URL+urlencode(params)\n\n    # }}}\n\n    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{\n            identifiers={}, timeout=30):\n        import json\n\n        br = self.browser\n        br.addheaders = [\n            ('Referer', 'https://www.edelweiss.plus/'),\n            ('X-Requested-With', 'XMLHttpRequest'),\n            ('Cache-Control', 'no-cache'),\n            ('Pragma', 'no-cache'),\n        ]\n        if 'edelweiss' in identifiers:\n            items = [identifiers['edelweiss']]\n        else:\n            log.error('Currently Edelweiss returns random books for search queries')\n            return\n            query = self.create_query(log, title=title, authors=authors,\n                    identifiers=identifiers)\n            if not query:\n                log.error('Insufficient metadata to construct query')\n                return\n            log('Using query URL:', query)\n            try:\n                raw = br.open(query, timeout=timeout).read().decode('utf-8')\n            except Exception as e:\n                log.exception('Failed to make identify query: %r'%query)\n                return as_unicode(e)\n            items = re.search(r'window[.]items\\s*=\\s*(.+?);', raw)\n            if items is None:\n                log.error('Failed to get list of matching items')\n                log.debug('Response text:')\n                log.debug(raw)\n                return\n            items = json.loads(items.group(1))\n\n        if (not items and identifiers and title and authors and\n                not abort.is_set()):\n            return self.identify(log, result_queue, abort, title=title,\n                    authors=authors, timeout=timeout)\n\n        if not items:\n            return\n\n        workers = []\n        items = items[:5]\n        for i, item in enumerate(get_basic_data(self.browser, log, *items)):\n            sku = item['sku']\n            for isbn in item['isbns']:\n                self.cache_isbn_to_identifier(isbn, sku)\n            if item['cover']:\n                self.cache_identifier_to_cover_url(sku, item['cover'])\n            fmt = item['format'].lower()\n            if 'audio' in fmt or 'mp3' in fmt:\n                continue  # Audio-book, ignore\n            workers.append(Worker(item, i, result_queue, br.clone_browser(), timeout, log, self))\n\n        if not workers:\n            return\n\n        for w in workers:\n            w.start()\n            # Don't send all requests at the same time\n            time.sleep(0.1)\n\n        while not abort.is_set():\n            a_worker_is_alive = False\n            for w in workers:\n                w.join(0.2)\n                if abort.is_set():\n                    break\n                if w.is_alive():\n                    a_worker_is_alive = True\n            if not a_worker_is_alive:\n                break\n\n    # }}}\n\n    def download_cover(self, log, result_queue, abort,  # {{{\n            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n        cached_url = self.get_cached_cover_url(identifiers)\n        if cached_url is None:\n            log.info('No cached cover found, running identify')\n            rq = Queue()\n            self.identify(log, rq, abort, title=title, authors=authors,\n                    identifiers=identifiers)\n            if abort.is_set():\n                return\n            results = []\n            while True:\n                try:\n                    results.append(rq.get_nowait())\n                except Empty:\n                    break\n            results.sort(key=self.identify_results_keygen(\n                title=title, authors=authors, identifiers=identifiers))\n            for mi in results:\n                cached_url = self.get_cached_cover_url(mi.identifiers)\n                if cached_url is not None:\n                    break\n        if cached_url is None:\n            log.info('No cover found')\n            return\n\n        if abort.is_set():\n            return\n        br = self.browser\n        log('Downloading cover from:', cached_url)\n        try:\n            cdata = br.open_novisit(cached_url, timeout=timeout).read()\n            result_queue.put((self, cdata))\n        except:\n            log.exception('Failed to download cover from:', cached_url)\n    # }}}\n\n\nif __name__ == '__main__':\n    from calibre.ebooks.metadata.sources.test import authors_test, comments_test, pubdate_test, test_identify_plugin, title_test\n    tests = [\n        (  # A title and author search\n         {'title': \"The Husband's Secret\", 'authors':['Liane Moriarty']},\n         [title_test(\"The Husband's Secret\", exact=True),\n          authors_test(['Liane Moriarty'])]\n        ),\n\n        (  # An isbn present in edelweiss\n         {'identifiers':{'isbn': '9780312621360'}, },\n         [title_test('Flame: A Sky Chasers Novel', exact=True),\n          authors_test(['Amy Kathleen Ryan'])]\n        ),\n\n        # Multiple authors and two part title and no general description\n        ({'identifiers':{'edelweiss':'0321180607'}},\n        [title_test('XQuery From the Experts: A Guide to the W3C XML Query Language', exact=True),\n         authors_test([\n            'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',\n            'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',\n            'Jim Tivy', 'Philip Wadler']),\n         pubdate_test(2003, 8, 22),\n         comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)\n        ]),\n    ]\n    start, stop = 0, len(tests)\n\n    tests = tests[start:stop]\n    test_identify_plugin(Edelweiss.name, tests)\n",
  "google": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\n# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport hashlib\nimport os\nimport re\nimport sys\nimport tempfile\nimport time\n\nimport regex\n\ntry:\n    from queue import Empty, Queue\nexcept ImportError:\n    from Queue import Empty, Queue\n\nfrom calibre import as_unicode, prepare_string_for_xml, replace_entities\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.ebooks.metadata import authors_to_string, check_isbn\nfrom calibre.ebooks.metadata.book.base import Metadata\nfrom calibre.ebooks.metadata.sources.base import Source\nfrom calibre.utils.cleantext import clean_ascii_chars\nfrom calibre.utils.localization import canonicalize_lang\n\nNAMESPACES = {\n    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',\n    'atom': 'http://www.w3.org/2005/Atom',\n    'dc': 'http://purl.org/dc/terms',\n    'gd': 'http://schemas.google.com/g/2005'\n}\n\n\ndef pretty_google_books_comments(raw):\n    raw = replace_entities(raw)\n    # Paragraphs in the comments are removed but whatever software googl uses\n    # to do this does not insert a space so we often find the pattern\n    # word.Capital in the comments which can be used to find paragraph markers.\n    parts = []\n    for x in re.split(r'([a-z)\"”])(\\.)([A-Z(\"“])', raw):\n        if x == '.':\n            parts.append('.</p>\\n\\n<p>')\n        else:\n            parts.append(prepare_string_for_xml(x))\n    raw = '<p>' + ''.join(parts) + '</p>'\n    return raw\n\n\ndef get_details(browser, url, timeout):  # {{{\n    try:\n        raw = browser.open_novisit(url, timeout=timeout).read()\n    except Exception as e:\n        gc = getattr(e, 'getcode', lambda: -1)\n        if gc() != 403:\n            raise\n        # Google is throttling us, wait a little\n        time.sleep(2)\n        raw = browser.open_novisit(url, timeout=timeout).read()\n\n    return raw\n# }}}\n\n\nxpath_cache = {}\n\n\ndef XPath(x):\n    ans = xpath_cache.get(x)\n    if ans is None:\n        from lxml import etree\n        ans = xpath_cache[x] = etree.XPath(x, namespaces=NAMESPACES)\n    return ans\n\n\ndef to_metadata(browser, log, entry_, timeout, running_a_test=False):  # {{{\n    from lxml import etree\n\n    # total_results  = XPath('//openSearch:totalResults')\n    # start_index    = XPath('//openSearch:startIndex')\n    # items_per_page = XPath('//openSearch:itemsPerPage')\n    entry = XPath('//atom:entry')\n    entry_id = XPath('descendant::atom:id')\n    url = XPath('descendant::atom:link[@rel=\"self\"]/@href')\n    creator = XPath('descendant::dc:creator')\n    identifier = XPath('descendant::dc:identifier')\n    title = XPath('descendant::dc:title')\n    date = XPath('descendant::dc:date')\n    publisher = XPath('descendant::dc:publisher')\n    subject = XPath('descendant::dc:subject')\n    description = XPath('descendant::dc:description')\n    language = XPath('descendant::dc:language')\n\n    # print(etree.tostring(entry_, pretty_print=True))\n\n    def get_text(extra, x):\n        try:\n            ans = x(extra)\n            if ans:\n                ans = ans[0].text\n                if ans and ans.strip():\n                    return ans.strip()\n        except:\n            log.exception('Programming error:')\n        return None\n\n    def get_extra_details():\n        raw = get_details(browser, details_url, timeout)\n        if running_a_test:\n            with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:\n                f.write(raw)\n                print('Book details saved to:', f.name, file=sys.stderr)\n        feed = etree.fromstring(\n            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n        )\n        return entry(feed)[0]\n\n    if isinstance(entry_, str):\n        google_id = entry_\n        details_url = 'https://www.google.com/books/feeds/volumes/' + google_id\n        extra = get_extra_details()\n        title_ = ': '.join([x.text for x in title(extra)]).strip()\n        authors = [x.text.strip() for x in creator(extra) if x.text]\n    else:\n        id_url = entry_id(entry_)[0].text\n        google_id = id_url.split('/')[-1]\n        details_url = url(entry_)[0]\n        title_ = ': '.join([x.text for x in title(entry_)]).strip()\n        authors = [x.text.strip() for x in creator(entry_) if x.text]\n        if not id_url or not title:\n            # Silently discard this entry\n            return None\n        extra = None\n\n    if not authors:\n        authors = [_('Unknown')]\n    if not title:\n        return None\n    if extra is None:\n        extra = get_extra_details()\n    mi = Metadata(title_, authors)\n    mi.identifiers = {'google': google_id}\n    mi.comments = get_text(extra, description)\n    lang = canonicalize_lang(get_text(extra, language))\n    if lang:\n        mi.language = lang\n    mi.publisher = get_text(extra, publisher)\n\n    # ISBN\n    isbns = []\n    for x in identifier(extra):\n        t = type('')(x.text).strip()\n        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):\n            if t[:5].upper() == 'ISBN:':\n                t = check_isbn(t[5:])\n                if t:\n                    isbns.append(t)\n    if isbns:\n        mi.isbn = sorted(isbns, key=len)[-1]\n    mi.all_isbns = isbns\n\n    # Tags\n    try:\n        btags = [x.text for x in subject(extra) if x.text]\n        tags = []\n        for t in btags:\n            atags = [y.strip() for y in t.split('/')]\n            for tag in atags:\n                if tag not in tags:\n                    tags.append(tag)\n    except:\n        log.exception('Failed to parse tags:')\n        tags = []\n    if tags:\n        mi.tags = [x.replace(',', ';') for x in tags]\n\n    # pubdate\n    pubdate = get_text(extra, date)\n    if pubdate:\n        from calibre.utils.date import parse_date, utcnow\n        try:\n            default = utcnow().replace(day=15)\n            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)\n        except:\n            log.error('Failed to parse pubdate %r' % pubdate)\n\n    # Cover\n    mi.has_google_cover = None\n    for x in extra.xpath(\n        '//*[@href and @rel=\"http://schemas.google.com/books/2008/thumbnail\"]'\n    ):\n        mi.has_google_cover = x.get('href')\n        break\n\n    return mi\n\n# }}}\n\n\nclass GoogleBooks(Source):\n\n    name = 'Google'\n    version = (1, 1, 2)\n    minimum_calibre_version = (2, 80, 0)\n    description = _('Downloads metadata and covers from Google Books')\n\n    capabilities = frozenset({'identify'})\n    touched_fields = frozenset({\n        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',\n        'identifier:isbn', 'identifier:google', 'languages'\n    })\n    supports_gzip_transfer_encoding = True\n    cached_cover_url_is_reliable = False\n\n    GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'\n\n    DUMMY_IMAGE_MD5 = frozenset(\n        ('0de4383ebad0adad5eeb8975cd796657', 'a64fa89d7ebc97075c1d363fc5fea71f')\n    )\n\n    def get_book_url(self, identifiers):  # {{{\n        goog = identifiers.get('google', None)\n        if goog is not None:\n            return ('google', goog, 'https://books.google.com/books?id=%s' % goog)\n    # }}}\n\n    def id_from_url(self, url):  # {{{\n        from polyglot.urllib import parse_qs, urlparse\n        purl = urlparse(url)\n        if purl.netloc == 'books.google.com':\n            q = parse_qs(purl.query)\n            gid = q.get('id')\n            if gid:\n                return 'google', gid[0]\n    # }}}\n\n    def create_query(self, title=None, authors=None, identifiers={}, capitalize_isbn=False):  # {{{\n        try:\n            from urllib.parse import urlencode\n        except ImportError:\n            from urllib import urlencode\n        BASE_URL = 'https://books.google.com/books/feeds/volumes?'\n        isbn = check_isbn(identifiers.get('isbn', None))\n        q = ''\n        if isbn is not None:\n            q += ('ISBN:' if capitalize_isbn else 'isbn:') + isbn\n        elif title or authors:\n\n            def build_term(prefix, parts):\n                return ' '.join('in' + prefix + ':' + x for x in parts)\n\n            title_tokens = list(self.get_title_tokens(title))\n            if title_tokens:\n                q += build_term('title', title_tokens)\n            author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n            if author_tokens:\n                q += ('+' if q else '') + build_term('author', author_tokens)\n\n        if not q:\n            return None\n        if not isinstance(q, bytes):\n            q = q.encode('utf-8')\n        return BASE_URL + urlencode({\n            'q': q,\n            'max-results': 20,\n            'start-index': 1,\n            'min-viewability': 'none',\n        })\n\n    # }}}\n\n    def download_cover(  # {{{\n        self,\n        log,\n        result_queue,\n        abort,\n        title=None,\n        authors=None,\n        identifiers={},\n        timeout=30,\n        get_best_cover=False\n    ):\n        cached_url = self.get_cached_cover_url(identifiers)\n        if cached_url is None:\n            log.info('No cached cover found, running identify')\n            rq = Queue()\n            self.identify(\n                log,\n                rq,\n                abort,\n                title=title,\n                authors=authors,\n                identifiers=identifiers\n            )\n            if abort.is_set():\n                return\n            results = []\n            while True:\n                try:\n                    results.append(rq.get_nowait())\n                except Empty:\n                    break\n            results.sort(\n                key=self.identify_results_keygen(\n                    title=title, authors=authors, identifiers=identifiers\n                )\n            )\n            for mi in results:\n                cached_url = self.get_cached_cover_url(mi.identifiers)\n                if cached_url is not None:\n                    break\n        if cached_url is None:\n            log.info('No cover found')\n            return\n\n        br = self.browser\n        for candidate in (0, 1):\n            if abort.is_set():\n                return\n            url = cached_url + '&zoom={}'.format(candidate)\n            log('Downloading cover from:', cached_url)\n            try:\n                cdata = br.open_novisit(url, timeout=timeout).read()\n                if cdata:\n                    if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:\n                        log.warning('Google returned a dummy image, ignoring')\n                    else:\n                        result_queue.put((self, cdata))\n                        break\n            except Exception:\n                log.exception('Failed to download cover from:', cached_url)\n\n    # }}}\n\n    def get_cached_cover_url(self, identifiers):  # {{{\n        url = None\n        goog = identifiers.get('google', None)\n        if goog is None:\n            isbn = identifiers.get('isbn', None)\n            if isbn is not None:\n                goog = self.cached_isbn_to_identifier(isbn)\n        if goog is not None:\n            url = self.cached_identifier_to_cover_url(goog)\n\n        return url\n\n    # }}}\n\n    def postprocess_downloaded_google_metadata(self, ans, relevance=0):  # {{{\n        if not isinstance(ans, Metadata):\n            return ans\n        ans.source_relevance = relevance\n        goog = ans.identifiers['google']\n        for isbn in getattr(ans, 'all_isbns', []):\n            self.cache_isbn_to_identifier(isbn, goog)\n        if getattr(ans, 'has_google_cover', False):\n            self.cache_identifier_to_cover_url(goog, self.GOOGLE_COVER % goog)\n        if ans.comments:\n            ans.comments = pretty_google_books_comments(ans.comments)\n        self.clean_downloaded_metadata(ans)\n        return ans\n    # }}}\n\n    def get_all_details(  # {{{\n        self,\n        br,\n        log,\n        entries,\n        abort,\n        result_queue,\n        timeout\n    ):\n        from lxml import etree\n        for relevance, i in enumerate(entries):\n            try:\n                ans = self.postprocess_downloaded_google_metadata(to_metadata(br, log, i, timeout, self.running_a_test), relevance)\n                if isinstance(ans, Metadata):\n                    result_queue.put(ans)\n            except Exception:\n                log.exception(\n                    'Failed to get metadata for identify entry:', etree.tostring(i)\n                )\n            if abort.is_set():\n                break\n\n    # }}}\n\n    def identify_via_web_search(  # {{{\n        self,\n        log,\n        result_queue,\n        abort,\n        title=None,\n        authors=None,\n        identifiers={},\n        timeout=30\n    ):\n        from calibre.utils.filenames import ascii_text\n        isbn = check_isbn(identifiers.get('isbn', None))\n        q = []\n        strip_punc_pat = regex.compile(r'[\\p{C}|\\p{M}|\\p{P}|\\p{S}|\\p{Z}]+', regex.UNICODE)\n        google_ids = []\n        check_tokens = set()\n        has_google_id = 'google' in identifiers\n\n        def to_check_tokens(*tokens):\n            for t in tokens:\n                if len(t) < 3:\n                    continue\n                t = t.lower()\n                if t in ('and', 'not', 'the'):\n                    continue\n                yield ascii_text(strip_punc_pat.sub('', t))\n\n        if has_google_id:\n            google_ids.append(identifiers['google'])\n        elif isbn is not None:\n            q.append(isbn)\n        elif title or authors:\n            title_tokens = list(self.get_title_tokens(title))\n            if title_tokens:\n                q += title_tokens\n                check_tokens |= set(to_check_tokens(*title_tokens))\n            author_tokens = list(self.get_author_tokens(authors, only_first_author=True))\n            if author_tokens:\n                q += author_tokens\n                check_tokens |= set(to_check_tokens(*author_tokens))\n        if not q and not google_ids:\n            return None\n        from calibre.ebooks.metadata.sources.update import search_engines_module\n        se = search_engines_module()\n        br = se.google_specialize_browser(se.browser())\n        if not has_google_id:\n            url = se.google_format_query(q, site='books.google.com')\n            log('Making query:', url)\n            r = []\n            root = se.query(br, url, 'google', timeout=timeout, save_raw=r.append)\n            pat = re.compile(r'id=([^&]+)')\n            for q in se.google_parse_results(root, r[0], log=log, ignore_uncached=False):\n                m = pat.search(q.url)\n                if m is None or not q.url.startswith('https://books.google'):\n                    continue\n                google_ids.append(m.group(1))\n\n        if not google_ids and isbn and (title or authors):\n            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n        found = False\n        seen = set()\n        for relevance, gid in enumerate(google_ids):\n            if gid in seen:\n                continue\n            seen.add(gid)\n            try:\n                ans = to_metadata(br, log, gid, timeout, self.running_a_test)\n                if isinstance(ans, Metadata):\n                    if isbn:\n                        if isbn not in ans.all_isbns:\n                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the ISBN:', isbn,\n                                'not in', ' '.join(ans.all_isbns))\n                            continue\n                    elif check_tokens:\n                        candidate = set(to_check_tokens(*self.get_title_tokens(ans.title)))\n                        candidate |= set(to_check_tokens(*self.get_author_tokens(ans.authors)))\n                        if candidate.intersection(check_tokens) != check_tokens:\n                            log('Excluding', ans.title, 'by', authors_to_string(ans.authors), 'as it does not match the query')\n                            continue\n                    ans = self.postprocess_downloaded_google_metadata(ans, relevance)\n                    result_queue.put(ans)\n                    found = True\n            except:\n                log.exception('Failed to get metadata for google books id:', gid)\n            if abort.is_set():\n                break\n        if not found and isbn and (title or authors):\n            return self.identify_via_web_search(log, result_queue, abort, title, authors, {}, timeout)\n    # }}}\n\n    def identify(  # {{{\n        self,\n        log,\n        result_queue,\n        abort,\n        title=None,\n        authors=None,\n        identifiers={},\n        timeout=30\n    ):\n        from lxml import etree\n        entry = XPath('//atom:entry')\n        identifiers = identifiers.copy()\n        br = self.browser\n        if 'google' in identifiers:\n            try:\n                ans = to_metadata(br, log, identifiers['google'], timeout, self.running_a_test)\n                if isinstance(ans, Metadata):\n                    self.postprocess_downloaded_google_metadata(ans)\n                    result_queue.put(ans)\n                    return\n            except Exception:\n                log.exception('Failed to get metadata for Google identifier:', identifiers['google'])\n            del identifiers['google']\n\n        query = self.create_query(\n            title=title, authors=authors, identifiers=identifiers\n        )\n        if not query:\n            log.error('Insufficient metadata to construct query')\n            return\n\n        def make_query(query):\n            log('Making query:', query)\n            try:\n                raw = br.open_novisit(query, timeout=timeout).read()\n            except Exception as e:\n                log.exception('Failed to make identify query: %r' % query)\n                return False, as_unicode(e)\n\n            try:\n                feed = etree.fromstring(\n                    xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],\n                    parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)\n                )\n                return True, entry(feed)\n            except Exception as e:\n                log.exception('Failed to parse identify results')\n                return False, as_unicode(e)\n        ok, entries = make_query(query)\n        if not ok:\n            return entries\n        if not entries and not abort.is_set():\n            log('No results found, doing a web search instead')\n            return self.identify_via_web_search(log, result_queue, abort, title, authors, identifiers, timeout)\n\n        # There is no point running these queries in threads as google\n        # throttles requests returning 403 Forbidden errors\n        self.get_all_details(br, log, entries, abort, result_queue, timeout)\n\n    # }}}\n\n\nif __name__ == '__main__':  # tests {{{\n    # To run these test use:\n    # calibre-debug src/calibre/ebooks/metadata/sources/google.py\n    from calibre.ebooks.metadata.sources.test import authors_test, test_identify_plugin, title_test\n    tests = [\n    ({\n        'identifiers': {'google': 's7NIrgEACAAJ'},\n    }, [title_test('Ride Every Stride', exact=False)]),\n\n    ({\n        'identifiers': {'isbn': '0743273567'},\n        'title': 'Great Gatsby',\n        'authors': ['Fitzgerald']\n    }, [\n        title_test('The great gatsby', exact=True),\n        authors_test(['F. Scott Fitzgerald'])\n    ]),\n\n    ({\n        'title': 'Flatland',\n        'authors': ['Abbott']\n    }, [title_test('Flatland', exact=False)]),\n\n    ({\n        'title': 'The Blood Red Indian Summer: A Berger and Mitry Mystery',\n        'authors': ['David Handler'],\n    }, [title_test('The Blood Red Indian Summer: A Berger and Mitry Mystery')\n    ]),\n\n    ({\n        # requires using web search to find the book\n        'title': 'Dragon Done It',\n        'authors': ['Eric Flint'],\n    }, [\n        title_test('The dragon done it', exact=True),\n        authors_test(['Eric Flint', 'Mike Resnick'])\n    ]),\n\n    ]\n    test_identify_plugin(GoogleBooks.name, tests[:])\n\n# }}}\n",
  "google_images": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__   = 'GPL v3'\n__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom collections import OrderedDict\n\nfrom calibre import random_user_agent\nfrom calibre.ebooks.metadata.sources.base import Option, Source\n\n\ndef parse_html(raw):\n    try:\n        from html5_parser import parse\n    except ImportError:\n        # Old versions of calibre\n        import html5lib\n        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n    else:\n        return parse(raw)\n\n\ndef imgurl_from_id(raw, tbnid):\n    from json import JSONDecoder\n    q = '\"{}\",['.format(tbnid)\n    start_pos = raw.index(q)\n    if start_pos < 100:\n        return\n    jd = JSONDecoder()\n    data = jd.raw_decode('[' + raw[start_pos:])[0]\n    # from pprint import pprint\n    # pprint(data)\n    url_num = 0\n    for x in data:\n        if isinstance(x, list) and len(x) == 3:\n            q = x[0]\n            if hasattr(q, 'lower') and q.lower().startswith('http'):\n                url_num += 1\n                if url_num > 1:\n                    return q\n\n\ndef parse_google_markup(raw):\n    root = parse_html(raw)\n    # newer markup pages use data-docid not data-tbnid\n    results = root.xpath('//div/@data-tbnid') or root.xpath('//div/@data-docid')\n    ans = OrderedDict()\n    for tbnid in results:\n        try:\n            imgurl = imgurl_from_id(raw, tbnid)\n        except Exception:\n            continue\n        if imgurl:\n            ans[imgurl] = True\n    return list(ans)\n\n\nclass GoogleImages(Source):\n\n    name = 'Google Images'\n    version = (1, 0, 6)\n    minimum_calibre_version = (2, 80, 0)\n    description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')\n    capabilities = frozenset(['cover'])\n    can_get_multiple_covers = True\n    supports_gzip_transfer_encoding = True\n    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),\n                      _('The maximum number of covers to process from the Google search result')),\n               Option('size', 'choices', 'svga', _('Cover size'),\n                      _('Search for covers larger than the specified size'),\n                      choices=OrderedDict((\n                          ('any', _('Any size'),),\n                          ('l', _('Large'),),\n                          ('qsvga', _('Larger than %s')%'400x300',),\n                          ('vga', _('Larger than %s')%'640x480',),\n                          ('svga', _('Larger than %s')%'600x800',),\n                          ('xga', _('Larger than %s')%'1024x768',),\n                          ('2mp', _('Larger than %s')%'2 MP',),\n                          ('4mp', _('Larger than %s')%'4 MP',),\n                      ))),\n    )\n\n    def download_cover(self, log, result_queue, abort,\n            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n        if not title:\n            return\n        timeout = max(60, timeout)  # Needs at least a minute\n        title = ' '.join(self.get_title_tokens(title))\n        author = ' '.join(self.get_author_tokens(authors))\n        urls = self.get_image_urls(title, author, log, abort, timeout)\n        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)\n\n    @property\n    def user_agent(self):\n        return random_user_agent(allow_ie=False)\n\n    def get_image_urls(self, title, author, log, abort, timeout):\n        from calibre.utils.cleantext import clean_ascii_chars\n        try:\n            from urllib.parse import urlencode\n        except ImportError:\n            from urllib import urlencode\n        br = self.browser\n        q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')})\n        if isinstance(q, bytes):\n            q = q.decode('utf-8')\n        sz = self.prefs['size']\n        if sz == 'any':\n            sz = ''\n        elif sz == 'l':\n            sz = 'isz:l,'\n        else:\n            sz = 'isz:lt,islt:%s,' % sz\n        # See https://www.google.com/advanced_image_search to understand this\n        # URL scheme\n        url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)\n        log('Search URL: ' + url)\n        # See https://github.com/benbusby/whoogle-search/pull/1054 for cookies\n        br.set_simple_cookie('CONSENT', 'PENDING+987', '.google.com', path='/')\n        template = b'\\x08\\x01\\x128\\x08\\x14\\x12+boq_identityfrontenduiserver_20231107.05_p0\\x1a\\x05en-US \\x03\\x1a\\x06\\x08\\x80\\xf1\\xca\\xaa\\x06'\n        from base64 import standard_b64encode\n        from datetime import date\n        template.replace(b'20231107', date.today().strftime('%Y%m%d').encode('ascii'))\n        br.set_simple_cookie('SOCS', standard_b64encode(template).decode('ascii').rstrip('='), '.google.com', path='/')\n        # br.set_debug_http(True)\n        raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))\n        # with open('/t/raw.html', 'w') as f:\n        #     f.write(raw)\n        return parse_google_markup(raw)\n\n\ndef test_raw():\n    import sys\n    raw = open(sys.argv[-1]).read()\n    for x in parse_google_markup(raw):\n        print(x)\n\n\ndef test(title='Star Trek: Section 31: Control', authors=('David Mack',)):\n    try:\n        from queue import Queue\n    except ImportError:\n        from Queue import Queue\n    from threading import Event\n\n    from calibre.utils.logging import default_log\n    p = GoogleImages(None)\n    p.log = default_log\n    rq = Queue()\n    p.download_cover(default_log, rq, Event(), title=title, authors=authors)\n    print('Downloaded', rq.qsize(), 'covers')\n\n\nif __name__ == '__main__':\n    test()\n",
  "hashes": {
    "amazon": "cb6b4178d198ae60ab1017e03a45d9d839899057",
    "big_book_search": "7a8b67c0f19ecbfe8a9d28b961aab1119f31c3e3",
    "edelweiss": "54f2d2d6d00d4a7081e72d08d8b7b4bb4288cb53",
    "google": "d7688a11f00e15ed8f9786e97cc74fe9184b9300",
    "google_images": "4244dd8267cb6215c7dfd2da166c6e02b1db31ea",
    "openlibrary": "239077a692701cbf0281e7a2e64306cd00217410",
    "search_engines": "9f1dbe2c712c5944b63f700dd8831b9c18231039"
  },
  "openlibrary": "#!/usr/bin/env python\n# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\n__license__   = 'GPL v3'\n__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'\n__docformat__ = 'restructuredtext en'\n\nfrom calibre.ebooks.metadata.sources.base import Source\n\n\nclass OpenLibrary(Source):\n\n    name = 'Open Library'\n    version = (1, 0, 2)\n    minimum_calibre_version = (2, 80, 0)\n    description = _('Downloads covers from The Open Library')\n\n    capabilities = frozenset(['cover'])\n\n    OPENLIBRARY = 'https://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'\n\n    def download_cover(self, log, result_queue, abort,\n            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):\n        if 'isbn' not in identifiers:\n            return\n        isbn = identifiers['isbn']\n        br = self.browser\n        try:\n            ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()\n            result_queue.put((self, ans))\n        except Exception as e:\n            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:\n                log.error('No cover for ISBN: %r found'%isbn)\n            else:\n                log.exception('Failed to download cover for ISBN:', isbn)\n",
  "search_engines": "#!/usr/bin/env python\n# vim:fileencoding=utf-8\n# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>\n\nfrom __future__ import absolute_import, division, print_function, unicode_literals\n\nimport json\nimport os\nimport re\nimport sys\nimport time\nfrom collections import namedtuple\nfrom contextlib import contextmanager\nfrom functools import partial\nfrom threading import Lock\n\ntry:\n    from urllib.parse import parse_qs, quote, quote_plus, urlencode, urlparse\nexcept ImportError:\n    from urllib import quote, quote_plus, urlencode\n\n    from urlparse import parse_qs, urlparse\n\nfrom lxml import etree\n\nfrom calibre import browser as _browser\nfrom calibre import prints as safe_print\nfrom calibre import random_user_agent\nfrom calibre.constants import cache_dir\nfrom calibre.ebooks.chardet import xml_to_unicode\nfrom calibre.utils.lock import ExclusiveFile\nfrom calibre.utils.random_ua import accept_header_for_ua\n\ncurrent_version = (1, 2, 14)\nminimum_calibre_version = (2, 80, 0)\nwebcache = {}\nwebcache_lock = Lock()\nprints = partial(safe_print, file=sys.stderr)\n\n\nResult = namedtuple('Result', 'url title cached_url')\n\n\n@contextmanager\ndef rate_limit(name='test', time_between_visits=2, max_wait_seconds=5 * 60, sleep_time=0.2):\n    lock_file = os.path.join(cache_dir(), 'search-engine.' + name + '.lock')\n    with ExclusiveFile(lock_file, timeout=max_wait_seconds, sleep_time=sleep_time) as f:\n        try:\n            lv = float(f.read().decode('utf-8').strip())\n        except Exception:\n            lv = 0\n        # we cannot use monotonic() as this is cross process and historical\n        # data as well\n        delta = time.time() - lv\n        if delta < time_between_visits:\n            time.sleep(time_between_visits - delta)\n        try:\n            yield\n        finally:\n            f.seek(0)\n            f.truncate()\n            f.write(repr(time.time()).encode('utf-8'))\n\n\ndef tostring(elem):\n    return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)\n\n\ndef browser():\n    ua = random_user_agent(allow_ie=False)\n    # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'\n    br = _browser(user_agent=ua)\n    br.set_handle_gzip(True)\n    br.addheaders += [\n        ('Accept', accept_header_for_ua(ua)),\n        ('Upgrade-insecure-requests', '1'),\n    ]\n    return br\n\n\ndef encode_query(**query):\n    q = {k.encode('utf-8'): v.encode('utf-8') for k, v in query.items()}\n    return urlencode(q).decode('utf-8')\n\n\ndef parse_html(raw):\n    try:\n        from html5_parser import parse\n    except ImportError:\n        # Old versions of calibre\n        import html5lib\n        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)\n    else:\n        return parse(raw)\n\n\ndef query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):\n    with rate_limit(key):\n        if simple_scraper is None:\n            raw = br.open_novisit(url, timeout=timeout).read()\n            raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]\n        else:\n            raw = simple_scraper(url, timeout=timeout)\n    if dump_raw is not None:\n        with open(dump_raw, 'w') as f:\n            f.write(raw)\n    if save_raw is not None:\n        save_raw(raw)\n    return parser(raw)\n\n\ndef quote_term(x):\n    ans = quote_plus(x.encode('utf-8'))\n    if isinstance(ans, bytes):\n        ans = ans.decode('utf-8')\n    return ans\n\n\n# DDG + Wayback machine {{{\n\ndef ddg_url_processor(url):\n    return url\n\n\ndef ddg_term(t):\n    t = t.replace('\"', '')\n    if t.lower() in {'map', 'news'}:\n        t = '\"' + t + '\"'\n    if t in {'OR', 'AND', 'NOT'}:\n        t = t.lower()\n    return t\n\n\ndef ddg_href(url):\n    if url.startswith('/'):\n        q = url.partition('?')[2]\n        url = parse_qs(q.encode('utf-8'))['uddg'][0].decode('utf-8')\n    return url\n\n\ndef wayback_machine_cached_url(url, br=None, log=prints, timeout=60):\n    q = quote_term(url)\n    br = br or browser()\n    try:\n        data = query(br, 'https://archive.org/wayback/available?url=' +\n                    q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)\n    except Exception as e:\n        log('Wayback machine query failed for url: ' + url + ' with error: ' + str(e))\n        return None\n    try:\n        closest = data['archived_snapshots']['closest']\n        if closest['available']:\n            ans = closest['url'].replace('http:', 'https:', 1)\n            # get unmodified HTML\n            ans = ans.replace(closest['timestamp'], closest['timestamp'] + 'id_', 1)\n            return ans\n    except Exception:\n        pass\n    from pprint import pformat\n    log('Response from wayback machine:', pformat(data))\n\n\ndef wayback_url_processor(url):\n    if url.startswith('/'):\n        # Use original URL instead of absolutizing to wayback URL as wayback is\n        # slow\n        m = re.search(r'https?:', url)\n        if m is None:\n            url = 'https://web.archive.org' + url\n        else:\n            url = url[m.start():]\n    return url\n\n\nddg_scraper_storage = []\n\n\ndef ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):\n    # https://duck.co/help/results/syntax\n    terms = [quote_term(ddg_term(t)) for t in terms]\n    if site is not None:\n        terms.append(quote_term(('site:' + site)))\n    q = '+'.join(terms)\n    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(\n        q=q, kp=1 if safe_search else -1)\n    log('Making ddg query: ' + url)\n    from calibre.scraper.simple import read_url\n    br = br or browser()\n    root = query(br, url, 'ddg', dump_raw, timeout=timeout, simple_scraper=partial(read_url, ddg_scraper_storage))\n    ans = []\n    for a in root.xpath('//*[@class=\"results\"]//*[@class=\"result__title\"]/a[@href and @class=\"result__a\"]'):\n        try:\n            ans.append(Result(ddg_href(a.get('href')), tostring(a), None))\n        except KeyError:\n            log('Failed to find ddg href in:', a.get('href'))\n    return ans, url\n\n\ndef ddg_develop():\n    br = browser()\n    for result in ddg_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:\n        if '/dp/' in result.url:\n            print(result.title)\n            print(' ', result.url)\n            print(' ', get_cached_url(result.url, br))\n            print()\n# }}}\n\n\n# Bing {{{\n\ndef bing_term(t):\n    t = t.replace('\"', '')\n    if t in {'OR', 'AND', 'NOT'}:\n        t = t.lower()\n    return t\n\n\ndef bing_url_processor(url):\n    return url\n\n\ndef resolve_bing_wrapper_page(url, br, log):\n    raw = br.open_novisit(url).read().decode('utf-8', 'replace')\n    m = re.search(r'var u = \"(.+)\"', raw)\n    if m is None:\n        log('Failed to resolve bing wrapper page for url: ' + url)\n        return url\n    log('Resolved bing wrapped URL: ' + url + ' to ' + m.group(1))\n    return m.group(1)\n\n\nbing_scraper_storage = []\n\n\ndef bing_search(\n    terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60,\n    show_user_agent=False, result_url_is_ok=lambda x: True\n):\n    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm\n    terms = [quote_term(bing_term(t)) for t in terms]\n    if site is not None:\n        terms.append(quote_term(('site:' + site)))\n    q = '+'.join(terms)\n    url = 'https://www.bing.com/search?q={q}'.format(q=q)\n    log('Making bing query: ' + url)\n    from calibre.scraper.simple import read_url\n    root = query(br, url, 'bing', dump_raw, timeout=timeout, simple_scraper=partial(read_url, bing_scraper_storage))\n    ans = []\n    result_items = root.xpath('//*[@id=\"b_results\"]/li[@class=\"b_algo\"]')\n    if not result_items:\n        log('Bing returned no results')\n        return ans, url\n    for li in result_items:\n        a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class=\"b_algoheader\"]/a[@href]')\n        a = a[0]\n        title = tostring(a)\n        ans_url = a.get('href')\n        if ans_url.startswith('https://www.bing.com/'):\n            ans_url = resolve_bing_wrapper_page(ans_url, br, log)\n        if result_url_is_ok(ans_url):\n            ans.append(Result(ans_url, title, None))\n    if not ans:\n        title = ' '.join(root.xpath('//title/text()'))\n        log('Failed to find any results on results page, with title:', title)\n    return ans, url\n\n\ndef bing_develop(terms='heroes abercrombie'):\n    if isinstance(terms, str):\n        terms = terms.split()\n    for result in bing_search(terms, 'www.amazon.com', dump_raw='/t/raw.html', show_user_agent=True)[0]:\n        if '/dp/' in result.url:\n            print(result.title)\n            print(' ', result.url)\n            print(' ', result.cached_url)\n            print()\n# }}}\n\n\n# Google {{{\n\ndef google_term(t):\n    t = t.replace('\"', '')\n    if t in {'OR', 'AND', 'NOT'}:\n        t = t.lower()\n    return t\n\n\ndef google_url_processor(url):\n    return url\n\n\ndef google_cache_url_for_url(url):\n    if not isinstance(url, bytes):\n        url = url.encode('utf-8')\n    cu = quote(url, safe='')\n    if isinstance(cu, bytes):\n        cu = cu.decode('utf-8')\n    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu\n\n\ndef google_get_cached_url(url, br=None, log=prints, timeout=60):\n    # Google's webcache was discontinued in september 2024\n    cached_url = google_cache_url_for_url(url)\n    br = google_specialize_browser(br or browser())\n    try:\n        raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)\n    except Exception as err:\n        log('Failed to get cached URL from google for URL: {} with error: {}'.format(url, err))\n    else:\n        with webcache_lock:\n            webcache[cached_url] = raw\n        return cached_url\n\n\ndef canonicalize_url_for_cache_map(url):\n    try:\n        purl = urlparse(url)\n    except Exception:\n        return url\n    if '.amazon.' in purl.netloc:\n        url = url.split('&', 1)[0]\n    return url\n\n\ndef google_parse_results(root, raw, log=prints, ignore_uncached=True):\n    ans = []\n    seen = set()\n    for a in root.xpath('//a[@href]'):\n        href = a.get('href')\n        if not href.startswith('/url?q=http'):\n            continue\n        try:\n            url = parse_qs(urlparse(href).query)['q'][0]\n            purl = urlparse(url)\n        except Exception:\n            continue\n        if 'google.com' in purl.netloc:\n            continue\n        try:\n            title = tostring(next(a.iterchildren('span')))\n        except StopIteration:\n            continue\n        curl = canonicalize_url_for_cache_map(url)\n        if curl in seen:\n            continue\n        seen.add(curl)\n        ans.append(Result(curl, title, None))\n    if not ans:\n        title = ' '.join(root.xpath('//title/text()'))\n        log('Failed to find any results on results page, with title:', title)\n    return ans\n\n\ndef google_consent_cookies():\n    # See https://github.com/benbusby/whoogle-search/pull/1054 for cookies\n    from base64 import standard_b64encode\n    from datetime import date\n    base = {'domain': '.google.com', 'path': '/'}\n    b = base.copy()\n    b['name'], b['value'] = 'CONSENT', 'PENDING+987'\n    yield b\n    template = b'\\x08\\x01\\x128\\x08\\x14\\x12+boq_identityfrontenduiserver_20231107.05_p0\\x1a\\x05en-US \\x03\\x1a\\x06\\x08\\x80\\xf1\\xca\\xaa\\x06'\n    template.replace(b'20231107', date.today().strftime('%Y%m%d').encode('ascii'))\n    b = base.copy()\n    b['name'], b['value'] = 'SOCS', standard_b64encode(template).decode('ascii').rstrip('=')\n    yield b\n\n\ndef google_specialize_browser(br):\n    with webcache_lock:\n        if not hasattr(br, 'google_consent_cookie_added'):\n            for c in google_consent_cookies():\n                br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path'])\n            br.google_consent_cookie_added = True\n    # google serves JS based pages without the right user agent\n    br.set_user_agent('L''y''nx''/2.''8.''6rel''.5 lib''ww''w-F''M/2.''1''4')  # noqa\n    return br\n\n\ndef is_probably_book_asin(t):\n    return t and len(t) == 10 and t.startswith('B') and t.upper() == t\n\n\ndef is_asin_or_isbn(t):\n    from calibre.ebooks.metadata import check_isbn\n    return bool(check_isbn(t) or is_probably_book_asin(t))\n\n\ndef google_format_query(terms, site=None, tbm=None):\n    prevent_spelling_correction = False\n    for t in terms:\n        if is_asin_or_isbn(t):\n            prevent_spelling_correction = True\n            break\n    terms = [quote_term(google_term(t)) for t in terms]\n    if site is not None:\n        terms.append(quote_term(('site:' + site)))\n    q = '+'.join(terms)\n    url = 'https://www.google.com/search?q={q}'.format(q=q)\n    # tbm causes 403 forbidden errors\n    # if tbm:\n    #     url += '&tbm=' + tbm\n    if prevent_spelling_correction:\n        url += '&nfpr=1'\n    return url\n\n\ndef google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):\n    url = google_format_query(terms, site)\n    log('Making google query: ' + url)\n    br = google_specialize_browser(br or browser())\n    r = []\n    root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)\n    return google_parse_results(root, r[0], log=log), url\n\n\ndef google_develop(search_terms='1423146786', raw_from=''):\n    if raw_from:\n        with open(raw_from, 'rb') as f:\n            raw = f.read()\n        results = google_parse_results(parse_html(raw), raw)\n    else:\n        br = browser()\n        results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]\n    for result in results:\n        if '/dp/' in result.url:\n            print(result.title)\n            print(' ', result.url)\n            print(' ', result.cached_url)\n            print()\n# }}}\n\n\n# Yandex {{{\ndef yandex_term(t):\n    t = t.replace('\"', '')\n    if t in {'OR', 'AND', 'NOT'}:\n        t = t.lower()\n    return t\n\n\ndef yandex_format_query(terms, site=None):\n    terms = [quote_term(yandex_term(t)) for t in terms]\n    if site is not None:\n        terms.append(quote_term(('site:' + site)))\n    q = '+'.join(terms)\n    url = 'https://yandex.com/search?text={q}'.format(q=q)\n    return url\n\n\ndef yandex_parse_results(root, raw, log=prints, ignore_uncached=True):\n    pass\n\n\nyandex_scraper_storage = []\n\n\ndef yandex_search(terms, site=None, br=None, dump_raw=None, log=prints, timeout=60):\n    # Sadly yandex uses CAPTCHAs aggresively\n    url = yandex_format_query(terms, site)\n    br = browser()\n    r = []\n    from calibre.scraper.simple import read_url\n    root = query(br, url, 'yandex', dump_raw, timeout=timeout, save_raw=r.append, simple_scraper=partial(read_url, yandex_scraper_storage))\n    return yandex_parse_results(root, r[0], log=log), url\n\n\ndef yandex_develop(search_terms='1423146786', raw_from=''):\n    if raw_from:\n        with open(raw_from, 'rb') as f:\n            raw = f.read()\n        results = yandex_parse_results(parse_html(raw), raw)\n    else:\n        results = yandex_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html')[0]\n    for result in results:\n        if '/dp/' in result.url:\n            print(result.title)\n            print(' ', result.url)\n            print(' ', result.cached_url)\n            print()\n\n# }}}\n\n\ndef get_cached_url(url, br=None, log=prints, timeout=60):\n    from threading import Lock, Thread\n\n    from polyglot.queue import Queue\n    print_lock = Lock()\n    q = Queue()\n\n    def safe_print(*a):\n        with print_lock:\n            log(*a)\n\n    def doit(func):\n        try:\n            q.put(func(url, br, safe_print, timeout))\n        except Exception as e:\n            safe_print(e)\n        q.put(None)\n\n    threads = []\n    threads.append(Thread(target=doit, args=(wayback_machine_cached_url,), daemon=True).start())\n    while threads:\n        x = q.get()\n        if x is not None:\n            return x\n        threads.pop()\n\n\ndef get_data_for_cached_url(url):\n    with webcache_lock:\n        return webcache.get(url)\n\n\ndef resolve_url(url):\n    prefix, rest = url.partition(':')[::2]\n    if prefix == 'bing':\n        return bing_url_processor(rest)\n    if prefix == 'wayback':\n        return wayback_url_processor(rest)\n    return url\n\n\n# if __name__ == '__main__':\n#     import sys\n#     func = sys.argv[-1]\n#     globals()[func]()\n"
}
 |