From 16ccbfdc70f9407b0bd600fe600e98ecfae7f198 Mon Sep 17 00:00:00 2001 From: Trygve Laugstøl Date: Thu, 23 May 2019 13:20:02 +0200 Subject: digikey: Better discovery of form for searching. --- src/ee/digikey/__init__.py | 60 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 10 deletions(-) (limited to 'src/ee/digikey/__init__.py') diff --git a/src/ee/digikey/__init__.py b/src/ee/digikey/__init__.py index 87860b9..3eb723e 100644 --- a/src/ee/digikey/__init__.py +++ b/src/ee/digikey/__init__.py @@ -14,6 +14,7 @@ from lxml import html from selenium import webdriver import ee._utils +from ee import EeException from ee.money import Money, get_default_context from ee.tools import mk_parents @@ -63,19 +64,21 @@ def _first(collection, default=None): class DigikeyStore(object): BASEURL = "http://purl.org/ee/supplier/digikey" - def __init__(self, url, store, products_url): + def __init__(self, url, store, frontpage_url, products_url): self.url = url self.store = store + self.frontpage_url = frontpage_url self.products_url = products_url @staticmethod def from_store_code(store_code): url = "{}?store={}".format(DigikeyStore.BASEURL, store_code) - products_url = "https://www.digikey.com/products/en" if store_code == "us" else \ - "https://www.digikey.{}/products/en".format(store_code) + frontpage_url = "https://www.digikey.com" if store_code == "us" else \ + "https://www.digikey.{}".format(store_code) + products_url = "{}/products/en".format(frontpage_url) - return DigikeyStore(url, store_code, products_url) + return DigikeyStore(url, store_code, frontpage_url, products_url) @staticmethod def from_url(store_url) -> Optional["DigikeyStore"]: @@ -256,6 +259,7 @@ class SearchResponseTypes(enum.Enum): SINGLE = 2 # A product page was returned TOO_MANY = 3 # A listing of categories was given, the user is expected to narrow down the search NO_MATCHES = 4 + EXCEPTION = 5 class DigikeySearchResponse(object): @@ -277,15 +281,42 @@ class DigikeyClient(object): self.baseurl = baseurl self.on_download = on_download or self.__nop self.cache = ee._utils.maybe_cache(cache_dir) - self.driver: Optional[webdriver.Chrome] = None + self._driver: Optional[webdriver.Chrome] = None def search(self, query: str, page_size=10) -> str: return self.product_search(query, page_size) + def _find_first_visible(self, xpath): + return next((e for e in self.driver.find_elements_by_xpath(xpath) if e.is_displayed()), None) + def product_search(self, query: str, page_size=10) -> str: params = {'lang': 'en', 'site': 'us', 'keywords': query, 'pageSize': str(page_size)} cache_key = urllib.parse.quote(query) - page = self._req(self.baseurl, cache_key=cache_key, params=params) + + cached = self.cache.lookup(cache_key) + if cached: + self.on_download("Using cached {}".format(cache_key)) + return cached + + def find_form(): + return self._find_first_visible(".//input[@name='keywords']"), \ + self._find_first_visible("//*[@id='header-search-button']") + + form_input, form_button = find_form() + if not form_input or not form_button: + self.driver.get(self.baseurl) + + form_input, form_button = find_form() + + if not form_input or not form_button: + raise EeException("Could not find form") + + form_input.send_keys(query) + form_button.click() + + page = self.driver.page_source + + self.cache.save(cache_key, page) return page @@ -299,10 +330,6 @@ class DigikeyClient(object): self.on_download("Downloading {}".format(url)) - if self.driver is None: - options = webdriver.ChromeOptions() - self.driver = webdriver.Chrome(chrome_options=options) - self.driver.get(url) src = self.driver.page_source @@ -316,6 +343,14 @@ class DigikeyClient(object): def get(self, url, cache_key, params=None): return self._req(url, cache_key, params) + @property + def driver(self) -> webdriver.Chrome: + if self._driver is None: + options = webdriver.ChromeOptions() + self._driver = webdriver.Chrome(chrome_options=options) + + return self._driver + class DigikeyParser(object): def __init__(self, digikey: Digikey): @@ -468,6 +503,11 @@ class DigikeyParser(object): def parse_string(self, origin_url, page_content: str): tree = html.fromstring(page_content) + fail = _first(tree.xpath("//h1[text()='403']")) + + if fail is not None: + return DigikeySearchResponse(0, SearchResponseTypes.EXCEPTION) + count = _first([_parse_int(e.text) for e in tree.xpath("//span[@id='matching-records-count']") if e.text]) if count: -- cgit v1.2.3