Skip to content

Commit 17dc6ef

Browse files
committed
w
1 parent 23e82b9 commit 17dc6ef

4 files changed

Lines changed: 23 additions & 11 deletions

File tree

_freeze/other/2 web scrap/3 web scrap advance/execute-results/html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"hash": "305af435316331b72089721c1f0d43bb",
33
"result": {
44
"engine": "jupyter",
5-
"markdown": "---\ntitle: \"web scrap with advance\"\nauthor: \"Tony Duan\"\n\nexecute:\n warning: false\n error: false\n eval: false\n\nformat:\n html:\n toc: true\n toc-location: right\n code-fold: show\n code-tools: true\n number-sections: true\n code-block-bg: true\n code-block-border-left: \"#31BAE9\"\n---\n\n::: {#920f4433 .cell execution_count=1}\n``` {.python .cell-code}\nfrom seleniumbase import SB\n\nwith SB(test=True, uc=True) as sb:\n sb.open(\"https://google.com/ncr\")\n sb.type('[title=\"Search\"]', \"SeleniumBase GitHub page\\n\")\n sb.click('[href*=\"github.com/seleniumbase/\"]')\n sb.save_screenshot_to_logs() # ./latest_logs/\n print(sb.get_page_title())\n```\n:::\n\n\n::: {#f412b7cd .cell execution_count=2}\n``` {.python .cell-code}\nfrom seleniumbase import SB\n\nwith SB(test=True, uc=True) as sb:\n sb.open(\"https://www.whiskybase.com/whiskies/\")\n #sb.type('[title=\"Search\"]', \"SeleniumBase GitHub page\\n\")\n #sb.click('[href*=\"github.com/seleniumbase/\"]')\n sb.save_screenshot_to_logs() # ./latest_logs/\n print(sb.get_page_title())\n```\n:::\n\n\n::: {#72338001 .cell execution_count=3}\n``` {.python .cell-code}\nfrom seleniumbase import Driver\nclass Scraper(BaseCase):\ndef test_bypass_bot_protection(self):\ndriver = Driver(uc=True)\ndriver.open(\"https://www.whiskybase.com/whiskies/whisky/268484/2009-ud\")\ndriver.uc_gui_click_captcha()\npage_html = driver.get_page_source()\nprint(page_html)\ndriver.quit()\n```\n:::\n\n\n::: {#c5151c77 .cell execution_count=4}\n``` {.python .cell-code}\nfrom bs4 import BeautifulSoup\nsoup = BeautifulSoup(page_html, 'html.parser')\nprint(soup.prettify())\n```\n:::\n\n\n::: {#ac55c2eb .cell execution_count=5}\n``` {.python .cell-code}\nprint(soup.get_text())\n```\n:::\n\n\n::: {#326d7d98 .cell execution_count=6}\n``` {.python .cell-code}\nsoup.select(\".votes-rating-current\").text\n```\n:::\n\n\nh1\n\n# Reference:\n\nhttps://github.com/seleniumbase/SeleniumBase\n\nhttps://medium.com/@datajournal/web-scraping-with-seleniumbase-e3ead6aebe7f\n\nhttps://github.com/ultrafunkamsterdam/undetected-chromedriver\n\n",
5+
"markdown": "---\ntitle: \"web scrap with advance\"\nauthor: \"Tony Duan\"\n\nexecute:\n warning: false\n error: false\n eval: false\n\nformat:\n html:\n toc: true\n toc-location: right\n code-fold: show\n code-tools: true\n number-sections: true\n code-block-bg: true\n code-block-border-left: \"#31BAE9\"\n---\n\n::: {#f4664121 .cell execution_count=1}\n``` {.python .cell-code}\nfrom seleniumbase import SB\n\nwith SB(test=True, uc=True) as sb:\n sb.open(\"https://google.com/ncr\")\n sb.type('[title=\"Search\"]', \"SeleniumBase GitHub page\\n\")\n sb.click('[href*=\"github.com/seleniumbase/\"]')\n sb.save_screenshot_to_logs() # ./latest_logs/\n print(sb.get_page_title())\n```\n:::\n\n\n::: {#27ca28e0 .cell execution_count=2}\n``` {.python .cell-code}\nfrom seleniumbase import SB\n\nwith SB(test=True, uc=True) as sb:\n sb.open(\"https://www.whiskybase.com/whiskies/\")\n #sb.type('[title=\"Search\"]', \"SeleniumBase GitHub page\\n\")\n #sb.click('[href*=\"github.com/seleniumbase/\"]')\n sb.save_screenshot_to_logs() # ./latest_logs/\n print(sb.get_page_title())\n```\n:::\n\n\n::: {#b67fbc70 .cell execution_count=3}\n``` {.python .cell-code}\nfrom seleniumbase import Driver\nclass Scraper(BaseCase):\ndef test_bypass_bot_protection(self):\ndriver = Driver(uc=True)\ndriver.open(\"https://www.whiskybase.com/whiskies/whisky/268484/2009-ud\")\ndriver.uc_gui_click_captcha()\npage_html = driver.get_page_source()\nprint(page_html)\ndriver.quit()\n```\n:::\n\n\n::: {#9e078a79 .cell execution_count=4}\n``` {.python .cell-code}\nfrom bs4 import BeautifulSoup\nsoup = BeautifulSoup(page_html, 'html.parser')\nprint(soup.prettify())\n```\n:::\n\n\n::: {#d632b64c .cell execution_count=5}\n``` {.python .cell-code}\nprint(soup.get_text())\n```\n:::\n\n\n::: {#203c4ca5 .cell execution_count=6}\n``` {.python .cell-code}\nsoup.select(\".votes-rating-current\").text\n```\n:::\n\n\nh1\n\n# Reference:\n\nhttps://github.com/seleniumbase/SeleniumBase\n\nhttps://medium.com/@datajournal/web-scraping-with-seleniumbase-e3ead6aebe7f\n\nhttps://github.com/ultrafunkamsterdam/undetected-chromedriver\n\n",
66
"supporting": [
77
"3 web scrap advance_files"
88
],

docs/Publish/5 git/git.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@
272272
<span class="dropdown-text">1 google analytics</span></a>
273273
</li>
274274
<li>
275-
<a class="dropdown-item" href="../../other/2 web scrap/web scrap.html">
275+
<a class="dropdown-item" href="../../other/2 web scrap/1 web scrap with rvest.html">
276276
<span class="dropdown-text">2 web scrap</span></a>
277277
</li>
278278
<li>

docs/other/2 web scrap/3 web scrap advance.html

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ <h2 id="toc-title">On this page</h2>
429429
</header>
430430

431431

432-
<div id="920f4433" class="cell" data-execution_count="1">
432+
<div id="f4664121" class="cell" data-execution_count="1">
433433
<details open="" class="code-fold">
434434
<summary>Code</summary>
435435
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> seleniumbase <span class="im">import</span> SB</span>
@@ -442,7 +442,7 @@ <h2 id="toc-title">On this page</h2>
442442
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(sb.get_page_title())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
443443
</details>
444444
</div>
445-
<div id="f412b7cd" class="cell" data-execution_count="2">
445+
<div id="27ca28e0" class="cell" data-execution_count="2">
446446
<details open="" class="code-fold">
447447
<summary>Code</summary>
448448
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> seleniumbase <span class="im">import</span> SB</span>
@@ -455,7 +455,7 @@ <h2 id="toc-title">On this page</h2>
455455
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(sb.get_page_title())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
456456
</details>
457457
</div>
458-
<div id="72338001" class="cell" data-execution_count="3">
458+
<div id="b67fbc70" class="cell" data-execution_count="3">
459459
<details open="" class="code-fold">
460460
<summary>Code</summary>
461461
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> seleniumbase <span class="im">import</span> Driver</span>
@@ -469,21 +469,21 @@ <h2 id="toc-title">On this page</h2>
469469
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a>driver.quit()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
470470
</details>
471471
</div>
472-
<div id="c5151c77" class="cell" data-execution_count="4">
472+
<div id="9e078a79" class="cell" data-execution_count="4">
473473
<details open="" class="code-fold">
474474
<summary>Code</summary>
475475
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> bs4 <span class="im">import</span> BeautifulSoup</span>
476476
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>soup <span class="op">=</span> BeautifulSoup(page_html, <span class="st">'html.parser'</span>)</span>
477477
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(soup.prettify())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
478478
</details>
479479
</div>
480-
<div id="ac55c2eb" class="cell" data-execution_count="5">
480+
<div id="d632b64c" class="cell" data-execution_count="5">
481481
<details open="" class="code-fold">
482482
<summary>Code</summary>
483483
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(soup.get_text())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
484484
</details>
485485
</div>
486-
<div id="326d7d98" class="cell" data-execution_count="6">
486+
<div id="203c4ca5" class="cell" data-execution_count="6">
487487
<details open="" class="code-fold">
488488
<summary>Code</summary>
489489
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>soup.select(<span class="st">".votes-rating-current"</span>).text</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>

docs/other/6 other/1 chromote.html

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@
270270
<span class="dropdown-text">1 google analytics</span></a>
271271
</li>
272272
<li>
273-
<a class="dropdown-item" href="../../other/2 web scrap/web scrap.html">
273+
<a class="dropdown-item" href="../../other/2 web scrap/1 web scrap with rvest.html">
274274
<span class="dropdown-text">2 web scrap</span></a>
275275
</li>
276276
<li>
@@ -344,8 +344,20 @@
344344
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth2 show">
345345
<li class="sidebar-item">
346346
<div class="sidebar-item-container">
347-
<a href="../../other/2 web scrap/web scrap.html" class="sidebar-item-text sidebar-link">
348-
<span class="menu-text">web scrap</span></a>
347+
<a href="../../other/2 web scrap/1 web scrap with rvest.html" class="sidebar-item-text sidebar-link">
348+
<span class="menu-text">web scrap with rvest</span></a>
349+
</div>
350+
</li>
351+
<li class="sidebar-item">
352+
<div class="sidebar-item-container">
353+
<a href="../../other/2 web scrap/2 web scrap with chromote.html" class="sidebar-item-text sidebar-link">
354+
<span class="menu-text">web scrap with chromote</span></a>
355+
</div>
356+
</li>
357+
<li class="sidebar-item">
358+
<div class="sidebar-item-container">
359+
<a href="../../other/2 web scrap/3 web scrap advance.html" class="sidebar-item-text sidebar-link">
360+
<span class="menu-text">web scrap with advance</span></a>
349361
</div>
350362
</li>
351363
</ul>

0 commit comments

Comments
 (0)