From 238627537deafd51f41c929747c041d193e66ab9 Mon Sep 17 00:00:00 2001 From: Trygve Laugstøl Date: Thu, 2 Aug 2018 23:34:45 +0200 Subject: o Handling digikey pages with a special "exact match" header + list of other categories to continue to search in. The header is just enough data for us. o Better digikey tests, saving local HTML instead of doing online requests. --- src/ee/digikey/__init__.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'src/ee/digikey') diff --git a/src/ee/digikey/__init__.py b/src/ee/digikey/__init__.py index 6fa3161..f230f00 100644 --- a/src/ee/digikey/__init__.py +++ b/src/ee/digikey/__init__.py @@ -213,16 +213,11 @@ class DigikeyClient(object): def __nop(self, message): pass - def __init__(self, digikey: Digikey, on_download=None): + def __init__(self, digikey: Digikey, cache_dir=None, on_download=None): self.digikey = digikey self.on_download = on_download or self.__nop - cache = FileCache('digikey_cache', forever=True) - self.sess = CacheControl(requests.Session(), cache=cache, heuristic=ExpiresAfter(days=1)) - - # adapter = CacheControlAdapter(cache=cache, heuristic=ExpiresAfter(days=1)) - # self.sess = requests.Session() - # self.sess.mount('http://', adapter) - # self.sess.mount('https://', adapter) + cache = FileCache(cache_dir or 'http_cache', forever=True) + self.sess = CacheControl(requests.Session(), cache=cache, heuristic=ExpiresAfter(days=10*365)) def _req(self, url, params=None): if not url.startswith("http"): @@ -274,8 +269,8 @@ class DigikeyClient(object): return None - # noinspection PyMethodMayBeStatic - def _search_process_multiple_results(self, tree: html, res: DigikeySearchResponse): + @staticmethod + def _handle_product_table(tree: html, res: DigikeySearchResponse): products = tree.xpath("//*[@itemtype='http://schema.org/Product']") for product in products: @@ -290,24 +285,43 @@ class DigikeyClient(object): return len(products) + @staticmethod + def _handle_exact_part_list(tree: html, res: DigikeySearchResponse): + products = tree.xpath(".//tr[@class='exactPart']") + + for product in products: + a = _first((a for a in product.xpath(".//td/span/a[@href]"))) + part_number = _first(product.xpath(".//span[last()]")) + + if a is not None and part_number is not None: + url = a.get("href") + mpn = a.text + res.append(DigikeyProduct(part_number.text, mpn, url)) + + return len(products) + def search(self, query: str, page_size=10) -> DigikeySearchResponse: - # http://www.digikey.com/products/en?x=0&y=0&lang=en&site=us&keywords=553-2320-1-ND - # params = {'lang': 'en', 'site': 'us', 'keywords': query, 'pageSize': str(page_size), 'x': 0, 'y': 0} params = {'lang': 'en', 'site': 'us', 'keywords': query, 'pageSize': str(page_size)} page = self._req("https://www.digikey.com/products/en", params=params) - # print("page: ") - # print(page.content) - tree = html.fromstring(page.content) + return self.parse_string(page.content) + + def parse_string(self, page_content: str): + tree = html.fromstring(page_content) count = _first([_parse_int(e.text) for e in tree.xpath("//span[@id='matching-records-count']") if e.text]) if count: product_table = _first(tree.xpath("//table[@id='productTable']")) + exact_part_list = _first(tree.xpath("//table[@id='exactPartList']")) if product_table is not None: res = DigikeySearchResponse(count, SearchResponseTypes.MANY) - self._search_process_multiple_results(product_table, res) + self._handle_product_table(product_table, res) + return res + elif exact_part_list is not None: + res = DigikeySearchResponse(count, SearchResponseTypes.MANY) + self._handle_exact_part_list(exact_part_list, res) return res else: # If the search matches multiple product categories the user has to select the appropriate category -- cgit v1.2.3