Building a FineWeb-style dataset for LLM pretraining

LLMs
data
Python
Author

Weyland Joyner

Published

April 4, 2026

Part I. The Web Crawler

Let’s start by building a web crawler that returns a sitemap; then we’ll use trafilatura to deal with retrieving and deduplicating content.

We’ll build three versions. The first and simplest uses a Python set for the visited set and a plain FIFO queue (collections.deque) as a stand-in for a priority queue; it doesn’t use asyncio yet. Version two will use a Bloom filter and respect robots.txt; I may also use a more sophisticated priority queue. Finally, version three will be async, built on asyncio.

# !pip install pybloom-live
from bs4 import BeautifulSoup
from collections import deque
# from pybloom_live import ScalableBloomFilter
import requests
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
class WebCrawler_v1:
    """Breadth-first, single-threaded crawler that builds a sitemap.

    Starting from ``startUrl``, pages are fetched one at a time with
    ``requests`` and their ``<a href>`` links are collected with
    BeautifulSoup. The frontier is a FIFO ``deque`` and already-seen URLs
    are tracked in a plain ``set``.
    """

    def __init__(self, startUrl: str):
        """Remember the URL the crawl starts from.

        Args:
            startUrl: Absolute URL of the first page to fetch.
        """
        self.startUrl = startUrl

    def _is_crawlable(self, url: str) -> bool:
        """Return True if *url* is an absolute http(s) URL with a host.

        This filters out ``mailto:``, ``javascript:``, ``tel:`` and similar
        non-web links.
        """
        parsed = urlparse(url)

        return parsed.scheme in ("http", "https") and bool(parsed.netloc)

    def _strip_fragment(self, url: str) -> str:
        """Return *url* without its ``#fragment`` part.

        ``page.html#a`` and ``page.html#b`` are the same document, so the
        crawl frontier compares URLs with the fragment removed.
        """
        return url.partition("#")[0]

    def _extract_links(self, url: str) -> list[str]:
        """Fetch *url* and return the crawlable links found on the page.

        Args:
            url: Page to download (10 second timeout).

        Returns:
            Absolute URLs of every ``<a href=...>`` on the page that passes
            ``_is_crawlable``, in document order. Duplicates and fragments
            are kept as found. An empty list is returned if the request
            fails, the server answers with an error status, or the page
            cannot be processed.
        """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            links = []
            for tag in soup.find_all("a", href=True):
                # Resolve relative hrefs against the page they were found on.
                absolute = urljoin(url, tag["href"])
                if self._is_crawlable(absolute):
                    links.append(absolute)

            return links
        except Exception:
            # Best effort: one bad page must not abort the whole crawl.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so a running crawl can be stopped.
            return []

    def crawl(self, limit: int = 100) -> dict[str, list[str]]:
        """Crawl breadth-first from ``startUrl`` and return a sitemap.

        Args:
            limit: Maximum number of pages to fetch. Values <= 0 fetch
                nothing.

        Returns:
            A dict mapping each fetched URL to the list of links found on
            it, in the order the pages were fetched. At most ``limit``
            entries are returned.
        """
        queue = deque([self.startUrl])
        # The start URL is queued exactly as given (so it stays a valid key
        # for callers), but its fragment-less form is also marked as seen so
        # a link back to the same page is not fetched a second time.
        visited = {self.startUrl, self._strip_fragment(self.startUrl)}
        sitemap: dict[str, list[str]] = {}

        # Every dequeued URL is unique, so len(sitemap) is the page count.
        while queue and len(sitemap) < limit:
            url = queue.popleft()
            links = self._extract_links(url)
            sitemap[url] = links
            for link in links:
                # Fragments only address a position inside a page; dropping
                # them here avoids re-downloading the same document.
                target = self._strip_fragment(link)
                if target not in visited:
                    visited.add(target)
                    queue.append(target)

        return sitemap
# Demo: crawl outward from the PyMOTW urllib.parse page with a small cap.
wc = WebCrawler_v1("https://pymotw.com/3/urllib.parse/index.html")
results = wc.crawl(limit=10)
/tmp/ipykernel_21260/16322375.py:15: XMLParsedAsHTMLWarning: It looks like you're using an HTML parser to parse an XML document.

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.

If you want or need to use an HTML parser on this document, you can make this warning go away by filtering it. To do that, run this code before calling the BeautifulSoup constructor:

    from bs4 import XMLParsedAsHTMLWarning
    import warnings

    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

  soup = BeautifulSoup(response.text, "html.parser")
# Dump the raw sitemap returned by crawl(): {fetched URL: [links found on that page]}.
print(results)
{'https://pymotw.com/3/urllib.parse/index.html': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#id1', 'https://pymotw.com/3/urllib.parse/index.html#id2', 'https://tools.ietf.org/html/rfc2396.html', 'https://pymotw.com/3/urllib.parse/index.html#id3', 'https://pymotw.com/3/urllib.parse/index.html#id4', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#id5', 'https://pymotw.com/3/urllib.parse/index.html#id6', 'https://pymotw.com/3/urllib.parse/index.html#id7', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#id8', 'https://pymotw.com/3/urllib.parse/index.html#id9', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/urllib.parse/index.html#id10', 'https://pymotw.com/3/urllib.parse/index.html#id11', 'https://pymotw.com/3/urllib.parse/index.html#id12', 'https://pymotw.com/3/urllib.parse/index.html#id13', 'https://pymotw.com/3/urllib.parse/index.html#id14', 'https://docs.python.org/3.7/library/urllib.parse.html', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://tools.ietf.org/html/rfc1738.html', 'https://tools.ietf.org/html/rfc1808.html', 'https://tools.ietf.org/html/rfc2396.html', 'https://tools.ietf.org/html/rfc3986.html', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#joining', 
'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/2/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/index.html': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/index.html#python-3-module-of-the-week', 'http://doughellmann.com/', 'http://www.python.org/', 'http://pymotw.com/2/', 'https://pymotw.com/3/about.html', 'https://pymotw.com/3/text.html', 'https://pymotw.com/3/string/index.html', 'https://pymotw.com/3/textwrap/index.html', 'https://pymotw.com/3/re/index.html', 'https://pymotw.com/3/difflib/index.html', 'https://pymotw.com/3/data_structures.html', 'https://pymotw.com/3/enum/index.html', 'https://pymotw.com/3/collections/index.html', 'https://pymotw.com/3/array/index.html', 'https://pymotw.com/3/heapq/index.html', 'https://pymotw.com/3/bisect/index.html', 'https://pymotw.com/3/queue/index.html', 'https://pymotw.com/3/struct/index.html', 'https://pymotw.com/3/weakref/index.html', 'https://pymotw.com/3/copy/index.html', 'https://pymotw.com/3/pprint/index.html', 'https://pymotw.com/3/algorithm_tools.html', 'https://pymotw.com/3/functools/index.html', 'https://pymotw.com/3/itertools/index.html', 
'https://pymotw.com/3/operator/index.html', 'https://pymotw.com/3/contextlib/index.html', 'https://pymotw.com/3/dates.html', 'https://pymotw.com/3/time/index.html', 'https://pymotw.com/3/datetime/index.html', 'https://pymotw.com/3/calendar/index.html', 'https://pymotw.com/3/numeric.html', 'https://pymotw.com/3/decimal/index.html', 'https://pymotw.com/3/fractions/index.html', 'https://pymotw.com/3/random/index.html', 'https://pymotw.com/3/math/index.html', 'https://pymotw.com/3/statistics/index.html', 'https://pymotw.com/3/file_access.html', 'https://pymotw.com/3/os.path/index.html', 'https://pymotw.com/3/pathlib/index.html', 'https://pymotw.com/3/glob/index.html', 'https://pymotw.com/3/fnmatch/index.html', 'https://pymotw.com/3/linecache/index.html', 'https://pymotw.com/3/tempfile/index.html', 'https://pymotw.com/3/shutil/index.html', 'https://pymotw.com/3/filecmp/index.html', 'https://pymotw.com/3/mmap/index.html', 'https://pymotw.com/3/codecs/index.html', 'https://pymotw.com/3/io/index.html', 'https://pymotw.com/3/persistence.html', 'https://pymotw.com/3/pickle/index.html', 'https://pymotw.com/3/shelve/index.html', 'https://pymotw.com/3/dbm/index.html', 'https://pymotw.com/3/sqlite3/index.html', 'https://pymotw.com/3/xml.etree.ElementTree/index.html', 'https://pymotw.com/3/csv/index.html', 'https://pymotw.com/3/compression.html', 'https://pymotw.com/3/zlib/index.html', 'https://pymotw.com/3/gzip/index.html', 'https://pymotw.com/3/bz2/index.html', 'https://pymotw.com/3/tarfile/index.html', 'https://pymotw.com/3/zipfile/index.html', 'https://pymotw.com/3/cryptographic.html', 'https://pymotw.com/3/hashlib/index.html', 'https://pymotw.com/3/hmac/index.html', 'https://pymotw.com/3/concurrency.html', 'https://pymotw.com/3/subprocess/index.html', 'https://pymotw.com/3/signal/index.html', 'https://pymotw.com/3/threading/index.html', 'https://pymotw.com/3/multiprocessing/index.html', 'https://pymotw.com/3/asyncio/index.html', 
'https://pymotw.com/3/concurrent.futures/index.html', 'https://pymotw.com/3/networking.html', 'https://pymotw.com/3/ipaddress/index.html', 'https://pymotw.com/3/socket/index.html', 'https://pymotw.com/3/selectors/index.html', 'https://pymotw.com/3/select/index.html', 'https://pymotw.com/3/socketserver/index.html', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.parse/index.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.robotparser/index.html', 'https://pymotw.com/3/base64/index.html', 'https://pymotw.com/3/http.server/index.html', 'https://pymotw.com/3/http.cookies/index.html', 'https://pymotw.com/3/webbrowser/index.html', 'https://pymotw.com/3/uuid/index.html', 'https://pymotw.com/3/json/index.html', 'https://pymotw.com/3/xmlrpc.client/index.html', 'https://pymotw.com/3/xmlrpc.server/index.html', 'https://pymotw.com/3/email.html', 'https://pymotw.com/3/smtplib/index.html', 'https://pymotw.com/3/smtpd/index.html', 'https://pymotw.com/3/mailbox/index.html', 'https://pymotw.com/3/imaplib/index.html', 'https://pymotw.com/3/frameworks.html', 'https://pymotw.com/3/argparse/index.html', 'https://pymotw.com/3/getopt/index.html', 'https://pymotw.com/3/readline/index.html', 'https://pymotw.com/3/getpass/index.html', 'https://pymotw.com/3/cmd/index.html', 'https://pymotw.com/3/shlex/index.html', 'https://pymotw.com/3/configparser/index.html', 'https://pymotw.com/3/logging/index.html', 'https://pymotw.com/3/fileinput/index.html', 'https://pymotw.com/3/atexit/index.html', 'https://pymotw.com/3/sched/index.html', 'https://pymotw.com/3/i18n.html', 'https://pymotw.com/3/gettext/index.html', 'https://pymotw.com/3/locale/index.html', 'https://pymotw.com/3/dev_tools.html', 'https://pymotw.com/3/pydoc/index.html', 'https://pymotw.com/3/doctest/index.html', 'https://pymotw.com/3/unittest/index.html', 'https://pymotw.com/3/trace/index.html', 'https://pymotw.com/3/traceback/index.html', 
'https://pymotw.com/3/cgitb/index.html', 'https://pymotw.com/3/pdb/index.html', 'https://pymotw.com/3/profile/index.html', 'https://pymotw.com/3/timeit/index.html', 'https://pymotw.com/3/tabnanny/index.html', 'https://pymotw.com/3/compileall/index.html', 'https://pymotw.com/3/pyclbr/index.html', 'https://pymotw.com/3/venv/index.html', 'https://pymotw.com/3/ensurepip/index.html', 'https://pymotw.com/3/runtime_services.html', 'https://pymotw.com/3/site/index.html', 'https://pymotw.com/3/sys/index.html', 'https://pymotw.com/3/os/index.html', 'https://pymotw.com/3/platform/index.html', 'https://pymotw.com/3/resource/index.html', 'https://pymotw.com/3/gc/index.html', 'https://pymotw.com/3/sysconfig/index.html', 'https://pymotw.com/3/language.html', 'https://pymotw.com/3/warnings/index.html', 'https://pymotw.com/3/abc/index.html', 'https://pymotw.com/3/dis/index.html', 'https://pymotw.com/3/inspect/index.html', 'https://pymotw.com/3/importing.html', 'https://pymotw.com/3/importlib/index.html', 'https://pymotw.com/3/pkgutil/index.html', 'https://pymotw.com/3/zipimport/index.html', 'https://pymotw.com/3/unix.html', 'https://pymotw.com/3/pwd/index.html', 'https://pymotw.com/3/grp/index.html', 'https://pymotw.com/3/porting_notes.html', 'https://pymotw.com/3/porting_notes.html#references', 'https://pymotw.com/3/porting_notes.html#new-modules', 'https://pymotw.com/3/porting_notes.html#renamed-modules', 'https://pymotw.com/3/porting_notes.html#removed-modules', 'https://pymotw.com/3/porting_notes.html#deprecated-modules', 'https://pymotw.com/3/porting_notes.html#summary-of-changes-to-modules', 'https://pymotw.com/3/third_party.html', 'https://pymotw.com/3/third_party.html#text', 'https://pymotw.com/3/third_party.html#algorithms', 'https://pymotw.com/3/third_party.html#dates-and-times', 'https://pymotw.com/3/third_party.html#mathematics', 'https://pymotw.com/3/third_party.html#data-persistence-and-exchange', 'https://pymotw.com/3/third_party.html#cryptography', 
'https://pymotw.com/3/third_party.html#concurrency-with-processes-threads-and-coroutines', 'https://pymotw.com/3/third_party.html#the-internet', 'https://pymotw.com/3/third_party.html#email', 'https://pymotw.com/3/third_party.html#application-building-blocks', 'https://pymotw.com/3/third_party.html#developer-tools', 'https://pymotw.com/3/about.html', 'https://pymotw.com/3/about.html#subscribe', 'https://pymotw.com/3/about.html#tools', 'https://pymotw.com/3/about.html#translations-and-other-versions', 'https://pymotw.com/3/about.html#copyright-and-licensing', 'https://pymotw.com/3/text.html', 'http://feeds.doughellmann.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://twitter.com/pymotw', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/py-modindex.html': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/py-modindex.html#cap-a', 'https://pymotw.com/3/py-modindex.html#cap-b', 'https://pymotw.com/3/py-modindex.html#cap-c', 'https://pymotw.com/3/py-modindex.html#cap-d', 'https://pymotw.com/3/py-modindex.html#cap-e', 'https://pymotw.com/3/py-modindex.html#cap-f', 'https://pymotw.com/3/py-modindex.html#cap-g', 'https://pymotw.com/3/py-modindex.html#cap-h', 'https://pymotw.com/3/py-modindex.html#cap-i', 
'https://pymotw.com/3/py-modindex.html#cap-j', 'https://pymotw.com/3/py-modindex.html#cap-l', 'https://pymotw.com/3/py-modindex.html#cap-m', 'https://pymotw.com/3/py-modindex.html#cap-o', 'https://pymotw.com/3/py-modindex.html#cap-p', 'https://pymotw.com/3/py-modindex.html#cap-q', 'https://pymotw.com/3/py-modindex.html#cap-r', 'https://pymotw.com/3/py-modindex.html#cap-s', 'https://pymotw.com/3/py-modindex.html#cap-t', 'https://pymotw.com/3/py-modindex.html#cap-u', 'https://pymotw.com/3/py-modindex.html#cap-v', 'https://pymotw.com/3/py-modindex.html#cap-w', 'https://pymotw.com/3/py-modindex.html#cap-x', 'https://pymotw.com/3/py-modindex.html#cap-z', 'https://pymotw.com/3/abc/index.html#module-abc', 'https://pymotw.com/3/argparse/index.html#module-argparse', 'https://pymotw.com/3/array/index.html#module-array', 'https://pymotw.com/3/asyncio/index.html#module-asyncio', 'https://pymotw.com/3/atexit/index.html#module-atexit', 'https://pymotw.com/3/base64/index.html#module-base64', 'https://pymotw.com/3/bisect/index.html#module-bisect', 'https://pymotw.com/3/bz2/index.html#module-bz2', 'https://pymotw.com/3/calendar/index.html#module-calendar', 'https://pymotw.com/3/cgitb/index.html#module-cgitb', 'https://pymotw.com/3/cmd/index.html#module-cmd', 'https://pymotw.com/3/codecs/index.html#module-codecs', 'https://pymotw.com/3/collections/index.html#module-collections', 'https://pymotw.com/3/collections/abc.html#module-collections.abc', 'https://pymotw.com/3/compileall/index.html#module-compileall', 'https://pymotw.com/3/concurrent.futures/index.html#module-concurrent.futures', 'https://pymotw.com/3/configparser/index.html#module-configparser', 'https://pymotw.com/3/contextlib/index.html#module-contextlib', 'https://pymotw.com/3/copy/index.html#module-copy', 'https://pymotw.com/3/csv/index.html#module-csv', 'https://pymotw.com/3/datetime/index.html#module-datetime', 'https://pymotw.com/3/dbm/index.html#module-dbm', 'https://pymotw.com/3/decimal/index.html#module-decimal', 
'https://pymotw.com/3/difflib/index.html#module-difflib', 'https://pymotw.com/3/dis/index.html#module-dis', 'https://pymotw.com/3/doctest/index.html#module-doctest', 'https://pymotw.com/3/ensurepip/index.html#module-ensurepip', 'https://pymotw.com/3/enum/index.html#module-enum', 'https://pymotw.com/3/filecmp/index.html#module-filecmp', 'https://pymotw.com/3/fileinput/index.html#module-fileinput', 'https://pymotw.com/3/fnmatch/index.html#module-fnmatch', 'https://pymotw.com/3/fractions/index.html#module-fractions', 'https://pymotw.com/3/functools/index.html#module-functools', 'https://pymotw.com/3/gc/index.html#module-gc', 'https://pymotw.com/3/getopt/index.html#module-getopt', 'https://pymotw.com/3/getpass/index.html#module-getpass', 'https://pymotw.com/3/gettext/index.html#module-gettext', 'https://pymotw.com/3/glob/index.html#module-glob', 'https://pymotw.com/3/grp/index.html#module-grp', 'https://pymotw.com/3/gzip/index.html#module-gzip', 'https://pymotw.com/3/hashlib/index.html#module-hashlib', 'https://pymotw.com/3/heapq/index.html#module-heapq', 'https://pymotw.com/3/hmac/index.html#module-hmac', 'https://pymotw.com/3/http.cookies/index.html#module-http.cookies', 'https://pymotw.com/3/http.server/index.html#module-http.server', 'https://pymotw.com/3/imaplib/index.html#module-imaplib', 'https://pymotw.com/3/importlib/index.html#module-importlib', 'https://pymotw.com/3/inspect/index.html#module-inspect', 'https://pymotw.com/3/io/index.html#module-io', 'https://pymotw.com/3/ipaddress/index.html#module-ipaddress', 'https://pymotw.com/3/itertools/index.html#module-itertools', 'https://pymotw.com/3/json/index.html#module-json', 'https://pymotw.com/3/linecache/index.html#module-linecache', 'https://pymotw.com/3/locale/index.html#module-locale', 'https://pymotw.com/3/logging/index.html#module-logging', 'https://pymotw.com/3/mailbox/index.html#module-mailbox', 'https://pymotw.com/3/math/index.html#module-math', 'https://pymotw.com/3/mmap/index.html#module-mmap', 
'https://pymotw.com/3/multiprocessing/index.html#module-multiprocessing', 'https://pymotw.com/3/operator/index.html#module-operator', 'https://pymotw.com/3/os/index.html#module-os', 'https://pymotw.com/3/os.path/index.html#module-os.path', 'https://pymotw.com/3/pathlib/index.html#module-pathlib', 'https://pymotw.com/3/pdb/index.html#module-pdb', 'https://pymotw.com/3/pickle/index.html#module-pickle', 'https://pymotw.com/3/pkgutil/index.html#module-pkgutil', 'https://pymotw.com/3/platform/index.html#module-platform', 'https://pymotw.com/3/pprint/index.html#module-pprint', 'https://pymotw.com/3/profile/index.html#module-profile', 'https://pymotw.com/3/profile/index.html#module-pstats', 'https://pymotw.com/3/pwd/index.html#module-pwd', 'https://pymotw.com/3/pyclbr/index.html#module-pyclbr', 'https://pymotw.com/3/pydoc/index.html#module-pydoc', 'https://pymotw.com/3/queue/index.html#module-queue', 'https://pymotw.com/3/random/index.html#module-random', 'https://pymotw.com/3/re/index.html#module-re', 'https://pymotw.com/3/readline/index.html#module-readline', 'https://pymotw.com/3/resource/index.html#module-resource', 'https://pymotw.com/3/sched/index.html#module-sched', 'https://pymotw.com/3/select/index.html#module-select', 'https://pymotw.com/3/selectors/index.html#module-selectors', 'https://pymotw.com/3/shelve/index.html#module-shelve', 'https://pymotw.com/3/shlex/index.html#module-shlex', 'https://pymotw.com/3/shutil/index.html#module-shutil', 'https://pymotw.com/3/signal/index.html#module-signal', 'https://pymotw.com/3/site/index.html#module-site', 'https://pymotw.com/3/site/index.html#module-sitecustomize', 'https://pymotw.com/3/smtpd/index.html#module-smtpd', 'https://pymotw.com/3/smtplib/index.html#module-smtplib', 'https://pymotw.com/3/socket/index.html#module-socket', 'https://pymotw.com/3/socketserver/index.html#module-socketserver', 'https://pymotw.com/3/sqlite3/index.html#module-sqlite3', 'https://pymotw.com/3/statistics/index.html#module-statistics', 
'https://pymotw.com/3/string/index.html#module-string', 'https://pymotw.com/3/struct/index.html#module-struct', 'https://pymotw.com/3/subprocess/index.html#module-subprocess', 'https://pymotw.com/3/sys/index.html#module-sys', 'https://pymotw.com/3/sysconfig/index.html#module-sysconfig', 'https://pymotw.com/3/tabnanny/index.html#module-tabnanny', 'https://pymotw.com/3/tarfile/index.html#module-tarfile', 'https://pymotw.com/3/tempfile/index.html#module-tempfile', 'https://pymotw.com/3/textwrap/index.html#module-textwrap', 'https://pymotw.com/3/threading/index.html#module-threading', 'https://pymotw.com/3/time/index.html#module-time', 'https://pymotw.com/3/timeit/index.html#module-timeit', 'https://pymotw.com/3/trace/index.html#module-trace', 'https://pymotw.com/3/traceback/index.html#module-traceback', 'https://pymotw.com/3/unittest/index.html#module-unittest', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://pymotw.com/3/urllib.robotparser/index.html#module-urllib.robotparser', 'https://pymotw.com/3/site/index.html#module-usercustomize', 'https://pymotw.com/3/uuid/index.html#module-uuid', 'https://pymotw.com/3/venv/index.html#module-venv', 'https://pymotw.com/3/warnings/index.html#module-warnings', 'https://pymotw.com/3/weakref/index.html#module-weakref', 'https://pymotw.com/3/webbrowser/index.html#module-webbrowser', 'https://pymotw.com/3/xml.etree.ElementTree/index.html#module-xml.etree.ElementTree', 'https://pymotw.com/3/xmlrpc.client/index.html#module-xmlrpc.client', 'https://pymotw.com/3/xmlrpc.server/index.html#module-xmlrpc.server', 'https://pymotw.com/3/zipfile/index.html#module-zipfile', 'https://pymotw.com/3/zipimport/index.html#module-zipimport', 'https://pymotw.com/3/zlib/index.html#module-zlib', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 
'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/genindex.html': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/genindex.html#A', 'https://pymotw.com/3/genindex.html#B', 'https://pymotw.com/3/genindex.html#C', 'https://pymotw.com/3/genindex.html#D', 'https://pymotw.com/3/genindex.html#E', 'https://pymotw.com/3/genindex.html#F', 'https://pymotw.com/3/genindex.html#G', 'https://pymotw.com/3/genindex.html#H', 'https://pymotw.com/3/genindex.html#I', 'https://pymotw.com/3/genindex.html#J', 'https://pymotw.com/3/genindex.html#L', 'https://pymotw.com/3/genindex.html#M', 'https://pymotw.com/3/genindex.html#O', 'https://pymotw.com/3/genindex.html#P', 'https://pymotw.com/3/genindex.html#Q', 'https://pymotw.com/3/genindex.html#R', 'https://pymotw.com/3/genindex.html#S', 'https://pymotw.com/3/genindex.html#T', 'https://pymotw.com/3/genindex.html#U', 'https://pymotw.com/3/genindex.html#V', 'https://pymotw.com/3/genindex.html#W', 'https://pymotw.com/3/genindex.html#X', 'https://pymotw.com/3/genindex.html#Z', 'https://pymotw.com/3/porting_notes.html#index-22', 'https://pymotw.com/3/abc/index.html#module-abc', 'https://pymotw.com/3/porting_notes.html#index-23', 'https://pymotw.com/3/porting_notes.html#index-20', 'https://pymotw.com/3/porting_notes.html#index-24', 'https://pymotw.com/3/argparse/index.html#module-argparse', 'https://pymotw.com/3/porting_notes.html#index-25', 'https://pymotw.com/3/array/index.html#module-array', 
'https://pymotw.com/3/porting_notes.html#index-4', 'https://pymotw.com/3/porting_notes.html#index-17', 'https://pymotw.com/3/asyncio/index.html#module-asyncio', 'https://pymotw.com/3/porting_notes.html#index-17', 'https://pymotw.com/3/porting_notes.html#index-26', 'https://pymotw.com/3/atexit/index.html#module-atexit', 'https://pymotw.com/3/porting_notes.html#index-27', 'https://pymotw.com/3/base64/index.html#module-base64', 'https://pymotw.com/3/bisect/index.html#module-bisect', 'https://pymotw.com/3/porting_notes.html#index-28', 'https://pymotw.com/3/bz2/index.html#module-bz2', 'https://pymotw.com/3/calendar/index.html#module-calendar', 'https://pymotw.com/3/atexit/index.html#index-0', 'https://pymotw.com/3/cgitb/index.html#module-cgitb', 'https://pymotw.com/3/cmd/index.html#module-cmd', 'https://pymotw.com/3/codecs/index.html#module-codecs', 'https://pymotw.com/3/porting_notes.html#index-29', 'https://pymotw.com/3/collections/index.html#module-collections', 'https://pymotw.com/3/collections/abc.html#module-collections.abc', 'https://pymotw.com/3/porting_notes.html#index-30', 'https://pymotw.com/3/compileall/index.html#module-compileall', 'https://pymotw.com/3/concurrent.futures/index.html#module-concurrent.futures', 'https://pymotw.com/3/porting_notes.html#index-31', 'https://pymotw.com/3/configparser/index.html#module-configparser', 'https://pymotw.com/3/porting_notes.html#index-32', 'https://pymotw.com/3/contextlib/index.html#module-contextlib', 'https://pymotw.com/3/copy/index.html#module-copy', 'https://pymotw.com/3/porting_notes.html#index-33', 'https://pymotw.com/3/csv/index.html#module-csv', 'https://pymotw.com/3/porting_notes.html#index-34', 'https://pymotw.com/3/datetime/index.html#module-datetime', 'https://pymotw.com/3/porting_notes.html#index-23', 'https://pymotw.com/3/dbm/index.html#module-dbm', 'https://pymotw.com/3/porting_notes.html#index-35', 'https://pymotw.com/3/decimal/index.html#module-decimal', 
'https://pymotw.com/3/atexit/index.html#index-1', 'https://pymotw.com/3/difflib/index.html#module-difflib', 'https://pymotw.com/3/dis/index.html#module-dis', 'https://pymotw.com/3/doctest/index.html#module-doctest', 'https://pymotw.com/3/porting_notes.html#index-8', 'https://pymotw.com/3/ensurepip/index.html#module-ensurepip', 'https://pymotw.com/3/enum/index.html#module-enum', 'https://pymotw.com/3/atexit/index.html#index-0', 'https://pymotw.com/3/filecmp/index.html#module-filecmp', 'https://pymotw.com/3/fileinput/index.html#module-fileinput', 'https://pymotw.com/3/fnmatch/index.html#module-fnmatch', 'https://pymotw.com/3/porting_notes.html#index-18', 'https://pymotw.com/3/porting_notes.html#index-36', 'https://pymotw.com/3/fractions/index.html#module-fractions', 'https://pymotw.com/3/functools/index.html#module-functools', 'https://pymotw.com/3/porting_notes.html#index-37', 'https://pymotw.com/3/gc/index.html#module-gc', 'https://pymotw.com/3/getopt/index.html#module-getopt', 'https://pymotw.com/3/getpass/index.html#module-getpass', 'https://pymotw.com/3/porting_notes.html#index-38', 'https://pymotw.com/3/gettext/index.html#module-gettext', 'https://pymotw.com/3/porting_notes.html#index-39', 'https://pymotw.com/3/glob/index.html#module-glob', 'https://pymotw.com/3/grp/index.html#module-grp', 'https://pymotw.com/3/gzip/index.html#module-gzip', 'https://pymotw.com/3/porting_notes.html#index-11', 'https://pymotw.com/3/porting_notes.html#index-7', 'https://pymotw.com/3/hashlib/index.html#module-hashlib', 'https://pymotw.com/3/heapq/index.html#module-heapq', 'https://pymotw.com/3/hmac/index.html#module-hmac', 'https://pymotw.com/3/porting_notes.html#index-6', 'https://pymotw.com/3/porting_notes.html#index-40', 'https://pymotw.com/3/http.cookies/index.html#module-http.cookies', 'https://pymotw.com/3/http.server/index.html#module-http.server', 'https://pymotw.com/3/porting_notes.html#index-41', 'https://pymotw.com/3/imaplib/index.html#module-imaplib', 
'https://pymotw.com/3/porting_notes.html#index-19', 'https://pymotw.com/3/porting_notes.html#index-19', 'https://pymotw.com/3/importlib/index.html#module-importlib', 'https://pymotw.com/3/porting_notes.html#index-42', 'https://pymotw.com/3/inspect/index.html#module-inspect', 'https://pymotw.com/3/porting_notes.html#index-10', 'https://pymotw.com/3/io/index.html#module-io', 'https://pymotw.com/3/ipaddress/index.html#module-ipaddress', 'https://pymotw.com/3/porting_notes.html#index-43', 'https://pymotw.com/3/itertools/index.html#module-itertools', 'https://pymotw.com/3/porting_notes.html#index-44', 'https://pymotw.com/3/json/index.html#module-json', 'https://pymotw.com/3/linecache/index.html#module-linecache', 'https://pymotw.com/3/porting_notes.html#index-45', 'https://pymotw.com/3/locale/index.html#module-locale', 'https://pymotw.com/3/porting_notes.html#index-46', 'https://pymotw.com/3/logging/index.html#module-logging', 'https://pymotw.com/3/porting_notes.html#index-47', 'https://pymotw.com/3/mailbox/index.html#module-mailbox', 'https://pymotw.com/3/math/index.html#module-math', 'https://pymotw.com/3/porting_notes.html#index-48', 'https://pymotw.com/3/mmap/index.html#module-mmap', 'https://pymotw.com/3/multiprocessing/index.html#module-multiprocessing', 'https://pymotw.com/3/porting_notes.html#index-49', 'https://pymotw.com/3/operator/index.html#module-operator', 'https://pymotw.com/3/porting_notes.html#index-20', 'https://pymotw.com/3/porting_notes.html#index-13', 'https://pymotw.com/3/porting_notes.html#index-50', 'https://pymotw.com/3/os/index.html#module-os', 'https://pymotw.com/3/porting_notes.html#index-51', 'https://pymotw.com/3/os.path/index.html#module-os.path', 'https://pymotw.com/3/pathlib/index.html#module-pathlib', 'https://pymotw.com/3/porting_notes.html#index-52', 'https://pymotw.com/3/pdb/index.html#module-pdb', 'https://pymotw.com/3/porting_notes.html#index-53', 'https://pymotw.com/3/pickle/index.html#module-pickle', 
'https://pymotw.com/3/porting_notes.html#index-54', 'https://pymotw.com/3/pkgutil/index.html#module-pkgutil', 'https://pymotw.com/3/porting_notes.html#index-55', 'https://pymotw.com/3/platform/index.html#module-platform', 'https://pymotw.com/3/porting_notes.html#index-72', 'https://pymotw.com/3/porting_notes.html#index-72', 'https://pymotw.com/3/porting_notes.html#index-72', 'https://pymotw.com/3/porting_notes.html#index-22', 'https://pymotw.com/3/porting_notes.html#index-23', 'https://pymotw.com/3/porting_notes.html#index-20', 'https://pymotw.com/3/porting_notes.html#index-24', 'https://pymotw.com/3/porting_notes.html#index-25', 'https://pymotw.com/3/porting_notes.html#index-4', 'https://pymotw.com/3/porting_notes.html#index-17', 'https://pymotw.com/3/porting_notes.html#index-17', 'https://pymotw.com/3/porting_notes.html#index-26', 'https://pymotw.com/3/porting_notes.html#index-27', 'https://pymotw.com/3/porting_notes.html#index-28', 'https://pymotw.com/3/porting_notes.html#index-21', 'https://pymotw.com/3/porting_notes.html#index-29', 'https://pymotw.com/3/porting_notes.html#index-30', 'https://pymotw.com/3/porting_notes.html#index-31', 'https://pymotw.com/3/porting_notes.html#index-32', 'https://pymotw.com/3/porting_notes.html#index-33', 'https://pymotw.com/3/porting_notes.html#index-34', 'https://pymotw.com/3/porting_notes.html#index-23', 'https://pymotw.com/3/porting_notes.html#index-35', 'https://pymotw.com/3/porting_notes.html#index-16', 'https://pymotw.com/3/porting_notes.html#index-8', 'https://pymotw.com/3/porting_notes.html#index-18', 'https://pymotw.com/3/porting_notes.html#index-36', 'https://pymotw.com/3/porting_notes.html#index-37', 'https://pymotw.com/3/porting_notes.html#index-38', 'https://pymotw.com/3/porting_notes.html#index-39', 'https://pymotw.com/3/porting_notes.html#index-11', 'https://pymotw.com/3/porting_notes.html#index-7', 'https://pymotw.com/3/porting_notes.html#index-6', 'https://pymotw.com/3/porting_notes.html#index-40', 
'https://pymotw.com/3/porting_notes.html#index-41', 'https://pymotw.com/3/porting_notes.html#index-19', 'https://pymotw.com/3/porting_notes.html#index-19', 'https://pymotw.com/3/porting_notes.html#index-42', 'https://pymotw.com/3/porting_notes.html#index-10', 'https://pymotw.com/3/porting_notes.html#index-43', 'https://pymotw.com/3/porting_notes.html#index-44', 'https://pymotw.com/3/porting_notes.html#index-45', 'https://pymotw.com/3/porting_notes.html#index-46', 'https://pymotw.com/3/porting_notes.html#index-47', 'https://pymotw.com/3/porting_notes.html#index-48', 'https://pymotw.com/3/porting_notes.html#index-49', 'https://pymotw.com/3/porting_notes.html#index-20', 'https://pymotw.com/3/porting_notes.html#index-13', 'https://pymotw.com/3/porting_notes.html#index-50', 'https://pymotw.com/3/porting_notes.html#index-51', 'https://pymotw.com/3/porting_notes.html#index-52', 'https://pymotw.com/3/porting_notes.html#index-53', 'https://pymotw.com/3/porting_notes.html#index-54', 'https://pymotw.com/3/porting_notes.html#index-55', 'https://pymotw.com/3/porting_notes.html#index-56', 'https://pymotw.com/3/porting_notes.html#index-12', 'https://pymotw.com/3/porting_notes.html#index-57', 'https://pymotw.com/3/porting_notes.html#index-2', 'https://pymotw.com/3/porting_notes.html#index-0', 'https://pymotw.com/3/porting_notes.html#index-58', 'https://pymotw.com/3/porting_notes.html#index-59', 'https://pymotw.com/3/porting_notes.html#index-15', 'https://pymotw.com/3/porting_notes.html#index-61', 'https://pymotw.com/3/porting_notes.html#index-62', 'https://pymotw.com/3/porting_notes.html#index-63', 'https://pymotw.com/3/porting_notes.html#index-64', 'https://pymotw.com/3/porting_notes.html#index-3', 'https://pymotw.com/3/porting_notes.html#index-30', 'https://pymotw.com/3/porting_notes.html#index-65', 'https://pymotw.com/3/porting_notes.html#index-9', 'https://pymotw.com/3/porting_notes.html#index-66', 'https://pymotw.com/3/porting_notes.html#index-68', 
'https://pymotw.com/3/porting_notes.html#index-14', 'https://pymotw.com/3/porting_notes.html#index-68', 'https://pymotw.com/3/porting_notes.html#index-70', 'https://pymotw.com/3/porting_notes.html#index-5', 'https://pymotw.com/3/porting_notes.html#index-71', 'https://pymotw.com/3/porting_notes.html#index-73', 'https://pymotw.com/3/porting_notes.html#index-74', 'https://pymotw.com/3/porting_notes.html#index-75', 'https://pymotw.com/3/porting_notes.html#index-76', 'https://pymotw.com/3/pprint/index.html#module-pprint', 'https://pymotw.com/3/profile/index.html#module-profile', 'https://pymotw.com/3/profile/index.html#module-pstats', 'https://pymotw.com/3/pwd/index.html#module-pwd', 'https://pymotw.com/3/pyclbr/index.html#module-pyclbr', 'https://pymotw.com/3/pydoc/index.html#module-pydoc', 'https://pymotw.com/3/codecs/index.html#index-0', 'https://pymotw.com/3/weakref/index.html#index-0', 'https://pymotw.com/3/warnings/index.html#index-0', 'https://pymotw.com/3/warnings/index.html#index-1', 'https://pymotw.com/3/porting_notes.html#index-67', 'https://pymotw.com/3/sqlite3/index.html#index-0', 'https://pymotw.com/3/string/index.html#index-0', 'https://pymotw.com/3/string/index.html#index-1', 'https://pymotw.com/3/importlib/index.html#index-0', 'https://pymotw.com/3/pkgutil/index.html#index-0', 'https://pymotw.com/3/sys/imports.html#index-0', 'https://pymotw.com/3/sys/imports.html#index-1', 'https://pymotw.com/3/zipimport/index.html#index-0', 'https://pymotw.com/3/csv/index.html#index-0', 'https://pymotw.com/3/porting_notes.html#index-1', 'https://pymotw.com/3/abc/index.html#index-0', 'https://pymotw.com/3/abc/index.html#index-1', 'https://pymotw.com/3/ipaddress/index.html#index-0', 'https://pymotw.com/3/concurrent.futures/index.html#index-0', 'https://pymotw.com/3/pickle/index.html#index-0', 'https://pymotw.com/3/asyncio/index.html#index-0', 'https://pymotw.com/3/collections/index.html#index-0', 'https://pymotw.com/3/contextlib/index.html#index-0', 
'https://pymotw.com/3/inspect/index.html#index-0', 'https://pymotw.com/3/sys/imports.html#index-2', 'https://pymotw.com/3/importlib/index.html#index-1', 'https://pymotw.com/3/asyncio/index.html#index-1', 'https://pymotw.com/3/venv/index.html#index-0', 'https://pymotw.com/3/sys/interpreter.html#index-0', 'https://pymotw.com/3/sys/imports.html#index-3', 'https://pymotw.com/3/pathlib/index.html#index-0', 'https://pymotw.com/3/enum/index.html#index-0', 'https://pymotw.com/3/functools/index.html#index-0', 'https://pymotw.com/3/statistics/index.html#index-0', 'https://pymotw.com/3/ensurepip/index.html#index-0', 'https://pymotw.com/3/porting_notes.html#index-60', 'https://pymotw.com/3/signal/index.html#index-0', 'https://pymotw.com/3/math/index.html#index-0', 'https://pymotw.com/3/importlib/index.html#index-2', 'https://pymotw.com/3/asyncio/index.html#index-2', 'https://pymotw.com/3/collections/index.html#index-1', 'https://pymotw.com/3/porting_notes.html#index-69', 'https://pymotw.com/3/queue/index.html#module-queue', 'https://pymotw.com/3/porting_notes.html#index-56', 'https://pymotw.com/3/random/index.html#module-random', 'https://pymotw.com/3/porting_notes.html#index-12', 'https://pymotw.com/3/porting_notes.html#index-57', 'https://pymotw.com/3/re/index.html#module-re', 'https://pymotw.com/3/readline/index.html#module-readline', 'https://pymotw.com/3/porting_notes.html#index-0', 'https://pymotw.com/3/resource/index.html#module-resource', 'https://pymotw.com/3/urllib.parse/index.html#index-1', 'https://pymotw.com/3/urllib.parse/index.html#index-2', 'https://pymotw.com/3/smtplib/index.html#index-1', 'https://pymotw.com/3/base64/index.html#index-1', 'https://pymotw.com/3/hmac/index.html#index-0', 'https://pymotw.com/3/hmac/index.html#index-1', 'https://pymotw.com/3/http.cookies/index.html#index-0', 'https://pymotw.com/3/http.cookies/index.html#index-1', 'https://pymotw.com/3/urllib.parse/index.html#index-0', 'https://pymotw.com/3/urllib.parse/index.html#index-3', 
'https://pymotw.com/3/smtpd/index.html#index-0', 'https://pymotw.com/3/imaplib/index.html#index-0', 'https://pymotw.com/3/base64/index.html#index-0', 'https://pymotw.com/3/urllib.parse/index.html#index-4', 'https://pymotw.com/3/uuid/index.html#index-0', 'https://pymotw.com/3/uuid/index.html#index-1', 'https://pymotw.com/3/imaplib/index.html#index-1', 'https://pymotw.com/3/smtpd/index.html#index-1', 'https://pymotw.com/3/smtplib/index.html#index-3', 'https://pymotw.com/3/http.server/index.html#index-0', 'https://pymotw.com/3/smtplib/index.html#index-0', 'https://pymotw.com/3/smtplib/index.html#index-2', 'https://pymotw.com/3/sched/index.html#module-sched', 'https://pymotw.com/3/select/index.html#module-select', 'https://pymotw.com/3/selectors/index.html#module-selectors', 'https://pymotw.com/3/porting_notes.html#index-58', 'https://pymotw.com/3/shelve/index.html#module-shelve', 'https://pymotw.com/3/shlex/index.html#module-shlex', 'https://pymotw.com/3/shutil/index.html#module-shutil', 'https://pymotw.com/3/porting_notes.html#index-59', 'https://pymotw.com/3/signal/index.html#module-signal', 'https://pymotw.com/3/porting_notes.html#index-15', 'https://pymotw.com/3/site/index.html#module-site', 'https://pymotw.com/3/site/index.html#module-sitecustomize', 'https://pymotw.com/3/smtpd/index.html#module-smtpd', 'https://pymotw.com/3/smtplib/index.html#module-smtplib', 'https://pymotw.com/3/porting_notes.html#index-61', 'https://pymotw.com/3/socket/index.html#module-socket', 'https://pymotw.com/3/porting_notes.html#index-62', 'https://pymotw.com/3/socketserver/index.html#module-socketserver', 'https://pymotw.com/3/sqlite3/index.html#module-sqlite3', 'https://pymotw.com/3/statistics/index.html#module-statistics', 'https://pymotw.com/3/porting_notes.html#index-63', 'https://pymotw.com/3/string/index.html#module-string', 'https://pymotw.com/3/porting_notes.html#index-64', 'https://pymotw.com/3/struct/index.html#module-struct', 
'https://pymotw.com/3/porting_notes.html#index-3', 'https://pymotw.com/3/porting_notes.html#index-30', 'https://pymotw.com/3/porting_notes.html#index-65', 'https://pymotw.com/3/porting_notes.html#index-9', 'https://pymotw.com/3/subprocess/index.html#module-subprocess', 'https://pymotw.com/3/porting_notes.html#index-66', 'https://pymotw.com/3/sys/index.html#module-sys', 'https://pymotw.com/3/sysconfig/index.html#module-sysconfig', 'https://pymotw.com/3/tabnanny/index.html#module-tabnanny', 'https://pymotw.com/3/tarfile/index.html#module-tarfile', 'https://pymotw.com/3/tempfile/index.html#module-tempfile', 'https://pymotw.com/3/textwrap/index.html#module-textwrap', 'https://pymotw.com/3/porting_notes.html#index-68', 'https://pymotw.com/3/porting_notes.html#index-14', 'https://pymotw.com/3/porting_notes.html#index-68', 'https://pymotw.com/3/threading/index.html#module-threading', 'https://pymotw.com/3/porting_notes.html#index-70', 'https://pymotw.com/3/time/index.html#module-time', 'https://pymotw.com/3/timeit/index.html#module-timeit', 'https://pymotw.com/3/porting_notes.html#index-5', 'https://pymotw.com/3/trace/index.html#module-trace', 'https://pymotw.com/3/traceback/index.html#module-traceback', 'https://pymotw.com/3/porting_notes.html#index-71', 'https://pymotw.com/3/unittest/index.html#module-unittest', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://pymotw.com/3/urllib.robotparser/index.html#module-urllib.robotparser', 'https://pymotw.com/3/site/index.html#module-usercustomize', 'https://pymotw.com/3/porting_notes.html#index-72', 'https://pymotw.com/3/porting_notes.html#index-72', 'https://pymotw.com/3/porting_notes.html#index-72', 'https://pymotw.com/3/porting_notes.html#index-73', 'https://pymotw.com/3/uuid/index.html#module-uuid', 'https://pymotw.com/3/venv/index.html#module-venv', 'https://pymotw.com/3/warnings/index.html#module-warnings', 
'https://pymotw.com/3/weakref/index.html#module-weakref', 'https://pymotw.com/3/webbrowser/index.html#module-webbrowser', 'https://pymotw.com/3/porting_notes.html#index-74', 'https://pymotw.com/3/porting_notes.html#index-75', 'https://pymotw.com/3/xml.etree.ElementTree/index.html#module-xml.etree.ElementTree', 'https://pymotw.com/3/xmlrpc.client/index.html#module-xmlrpc.client', 'https://pymotw.com/3/xmlrpc.server/index.html#module-xmlrpc.server', 'https://pymotw.com/3/zipfile/index.html#module-zipfile', 'https://pymotw.com/3/porting_notes.html#index-76', 'https://pymotw.com/3/zipimport/index.html#module-zipimport', 'https://pymotw.com/3/zlib/index.html#module-zlib', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'http://www.twitter.com/pymotw': [], 'https://feeds.feedburner.com/PyMOTW': [], 'https://pymotw.com/3/internet_protocols.html': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/internet_protocols.html#the-internet', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://pymotw.com/3/base64/index.html#module-base64', 'https://pymotw.com/3/urllib.robotparser/index.html#module-urllib.robotparser', 'https://pymotw.com/3/http.server/index.html#module-http.server', 
'https://pymotw.com/3/http.cookies/index.html#module-http.cookies', 'https://pymotw.com/3/uuid/index.html#module-uuid', 'https://pymotw.com/3/json/index.html#module-json', 'https://pymotw.com/3/xmlrpc.client/index.html#module-xmlrpc.client', 'https://pymotw.com/3/xmlrpc.server/index.html#module-xmlrpc.server', 'https://pymotw.com/3/urllib.parse/index.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.robotparser/index.html', 'https://pymotw.com/3/base64/index.html', 'https://pymotw.com/3/http.server/index.html', 'https://pymotw.com/3/http.cookies/index.html', 'https://pymotw.com/3/webbrowser/index.html', 'https://pymotw.com/3/uuid/index.html', 'https://pymotw.com/3/json/index.html', 'https://pymotw.com/3/xmlrpc.client/index.html', 'https://pymotw.com/3/xmlrpc.server/index.html', 'https://pymotw.com/3/socketserver/index.html', 'https://pymotw.com/3/urllib.parse/index.html', 'https://pymotw.com/3/socketserver/index.html', 'https://pymotw.com/3/urllib.parse/index.html', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/2/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 
'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#id1', 'https://pymotw.com/3/urllib.parse/index.html#id2', 'https://tools.ietf.org/html/rfc2396.html', 'https://pymotw.com/3/urllib.parse/index.html#id3', 'https://pymotw.com/3/urllib.parse/index.html#id4', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#id5', 'https://pymotw.com/3/urllib.parse/index.html#id6', 'https://pymotw.com/3/urllib.parse/index.html#id7', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#id8', 'https://pymotw.com/3/urllib.parse/index.html#id9', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/urllib.parse/index.html#id10', 'https://pymotw.com/3/urllib.parse/index.html#id11', 'https://pymotw.com/3/urllib.parse/index.html#id12', 'https://pymotw.com/3/urllib.parse/index.html#id13', 'https://pymotw.com/3/urllib.parse/index.html#id14', 'https://docs.python.org/3.7/library/urllib.parse.html', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://tools.ietf.org/html/rfc1738.html', 'https://tools.ietf.org/html/rfc1808.html', 'https://tools.ietf.org/html/rfc2396.html', 'https://tools.ietf.org/html/rfc3986.html', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/2/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 
'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/urllib.parse/index.html#parsing': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#id1', 'https://pymotw.com/3/urllib.parse/index.html#id2', 'https://tools.ietf.org/html/rfc2396.html', 'https://pymotw.com/3/urllib.parse/index.html#id3', 'https://pymotw.com/3/urllib.parse/index.html#id4', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#id5', 'https://pymotw.com/3/urllib.parse/index.html#id6', 'https://pymotw.com/3/urllib.parse/index.html#id7', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#id8', 'https://pymotw.com/3/urllib.parse/index.html#id9', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/urllib.parse/index.html#id10', 'https://pymotw.com/3/urllib.parse/index.html#id11', 'https://pymotw.com/3/urllib.parse/index.html#id12', 'https://pymotw.com/3/urllib.parse/index.html#id13', 'https://pymotw.com/3/urllib.parse/index.html#id14', 'https://docs.python.org/3.7/library/urllib.parse.html', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://tools.ietf.org/html/rfc1738.html', 'https://tools.ietf.org/html/rfc1808.html', 
'https://tools.ietf.org/html/rfc2396.html', 'https://tools.ietf.org/html/rfc3986.html', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/2/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/urllib.parse/index.html#id1': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#id1', 'https://pymotw.com/3/urllib.parse/index.html#id2', 'https://tools.ietf.org/html/rfc2396.html', 'https://pymotw.com/3/urllib.parse/index.html#id3', 'https://pymotw.com/3/urllib.parse/index.html#id4', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#id5', 'https://pymotw.com/3/urllib.parse/index.html#id6', 'https://pymotw.com/3/urllib.parse/index.html#id7', 
'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#id8', 'https://pymotw.com/3/urllib.parse/index.html#id9', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/urllib.parse/index.html#id10', 'https://pymotw.com/3/urllib.parse/index.html#id11', 'https://pymotw.com/3/urllib.parse/index.html#id12', 'https://pymotw.com/3/urllib.parse/index.html#id13', 'https://pymotw.com/3/urllib.parse/index.html#id14', 'https://docs.python.org/3.7/library/urllib.parse.html', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://tools.ietf.org/html/rfc1738.html', 'https://tools.ietf.org/html/rfc1808.html', 'https://tools.ietf.org/html/rfc2396.html', 'https://tools.ietf.org/html/rfc3986.html', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/2/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/'], 'https://pymotw.com/3/urllib.parse/index.html#id2': ['https://pymotw.com/3/index.html', 'https://pymotw.com/3/py-modindex.html', 
'https://pymotw.com/3/genindex.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.parse/index.html#module-urllib.parse', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#id1', 'https://pymotw.com/3/urllib.parse/index.html#id2', 'https://tools.ietf.org/html/rfc2396.html', 'https://pymotw.com/3/urllib.parse/index.html#id3', 'https://pymotw.com/3/urllib.parse/index.html#id4', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#id5', 'https://pymotw.com/3/urllib.parse/index.html#id6', 'https://pymotw.com/3/urllib.parse/index.html#id7', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#id8', 'https://pymotw.com/3/urllib.parse/index.html#id9', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/urllib.parse/index.html#id10', 'https://pymotw.com/3/urllib.parse/index.html#id11', 'https://pymotw.com/3/urllib.parse/index.html#id12', 'https://pymotw.com/3/urllib.parse/index.html#id13', 'https://pymotw.com/3/urllib.parse/index.html#id14', 'https://docs.python.org/3.7/library/urllib.parse.html', 'https://pymotw.com/3/urllib.request/index.html#module-urllib.request', 'https://tools.ietf.org/html/rfc1738.html', 'https://tools.ietf.org/html/rfc1808.html', 'https://tools.ietf.org/html/rfc2396.html', 'https://tools.ietf.org/html/rfc3986.html', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 'https://pymotw.com/3/urllib.parse/index.html#parsing', 'https://pymotw.com/3/urllib.parse/index.html#unparsing', 'https://pymotw.com/3/urllib.parse/index.html#joining', 'https://pymotw.com/3/urllib.parse/index.html#encoding-query-arguments', 'https://pymotw.com/3/internet_protocols.html', 'https://pymotw.com/3/urllib.request/index.html', 
'https://doughellmann.com/books/the-python-3-standard-library-by-example/', 'https://pymotw.com/2/', 'https://pymotw.com/3/py-modindex.html', 'https://pymotw.com/3/genindex.html', 'https://pymotw.com/3/index.html', 'https://pymotw.com/3/about.html', 'http://www.twitter.com/pymotw', 'https://feeds.feedburner.com/PyMOTW', 'http://feedburner.google.com/fb/a/mailverify?uri=PyMOTW&loc=en_US', 'https://pymotw.com/3/about.html', 'http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_US', 'https://doughellmann.com/', 'https://doughellmann.com/books/the-python-3-standard-library-by-example/']}
# Number of links stored under the start page's key; the bare number on the
# next line is the recorded notebook output, not code.
# NOTE(review): `results` is presumably the sitemap dict printed above — confirm.
len(results['https://pymotw.com/3/urllib.parse/index.html'])
53
# Number of keys (pages) in `results`; again followed by its recorded output.
len(results)
11
class WebCrawler_v2:
    """Breadth-first, single-host crawler that consults robots.txt.

    Starting from ``startUrl`` the crawler follows ``<a href>`` links whose
    host matches the start URL's host, skips URLs that the host's robots.txt
    disallows for user agent ``*``, and records each fetched page together
    with the links found on it.

    URLs are compared without their ``#fragment``: a fragment only names a
    position inside a document, so ``page.html#a`` and ``page.html#b`` are
    the same page and must not be fetched twice.

    ``crawl`` needs ``ScalableBloomFilter`` (from ``pybloom_live``) to be
    imported in the enclosing module.
    """

    def __init__(self, startUrl: str):
        """Remember the start URL and derive the host the crawl is confined to."""
        self.startUrl = startUrl
        # One parser per "scheme://host", so robots.txt is requested once per host.
        self.robots_cache = {}
        # Only links whose netloc equals this value are ever queued.
        self.allowed_domain = urlparse(startUrl).netloc

    @staticmethod
    def _strip_fragment(url: str) -> str:
        """Return *url* with any ``#fragment`` removed."""
        return urlparse(url)._replace(fragment="").geturl()

    def _is_same_domain(self, url: str) -> bool:
        """Return True when *url* has exactly the start URL's netloc."""
        return urlparse(url).netloc == self.allowed_domain

    def _is_crawlable(self, url: str) -> bool:
        """Return True for absolute http(s) URLs (rejects mailto:, javascript:, ...)."""
        parsed = urlparse(url)
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)

    def _get_robots(self, url: str) -> RobotFileParser:
        """Return the (cached) robots.txt parser for the host of *url*.

        If robots.txt cannot be retrieved at all, the host is treated as
        allowing everything.
        """
        parsed = urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"

        rp = self.robots_cache.get(base)
        if rp is None:
            rp = RobotFileParser()
            rp.set_url(f"{base}/robots.txt")
            try:
                rp.read()
            except Exception:
                # read() only deals with HTTP status errors itself; connection
                # and decoding failures propagate. A parser that never read
                # anything answers False to every can_fetch() call, so the
                # "assume allowed" policy has to be set explicitly.
                rp.allow_all = True
            self.robots_cache[base] = rp

        return rp

    def _is_allowed(self, url: str) -> bool:
        """Return True when robots.txt lets user agent ``*`` fetch *url*."""
        return self._get_robots(url).can_fetch("*", url)

    def _extract_links(self, url: str) -> list[str]:
        """Fetch *url* and return the distinct crawlable links found on it.

        Links are made absolute against *url*, stripped of their fragment and
        returned in document order without repeats. A failed request
        (connection error, timeout, non-2xx status) yields an empty list.
        """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            return []

        soup = BeautifulSoup(response.text, "html.parser")

        # dict keys give order-preserving de-duplication.
        links = {}
        for tag in soup.find_all("a", href=True):
            try:
                absolute = self._strip_fragment(urljoin(url, tag["href"]))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip this one
                # link instead of discarding the whole page.
                continue
            if self._is_crawlable(absolute):
                links[absolute] = None

        return list(links)

    def crawl(self, limit: int = 1000) -> dict[str, list[str]]:
        """Crawl breadth-first from the start URL.

        Args:
            limit: Maximum number of pages to fetch. URLs skipped because of
                robots.txt do not count towards it.

        Returns:
            A mapping from each fetched URL to the links found on that page
            (including off-host links, which are recorded but not followed).
        """
        start = self._strip_fragment(self.startUrl)
        queue = deque([start])
        # A Bloom filter can report an unseen URL as seen (bounded by
        # error_rate) but never the reverse, so nothing is queued twice.
        visited = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH, error_rate=0.001
        )
        visited.add(start)
        sitemap = {}
        count = 0

        while queue and count < limit:
            url = queue.popleft()
            if not self._is_allowed(url):
                continue
            # TODO: honour the host's Crawl-delay between requests.

            links = self._extract_links(url)
            sitemap[url] = links
            for link in links:
                if link not in visited and self._is_same_domain(link):
                    queue.append(link)
                    visited.add(link)

            count += 1

        return sitemap
# Crawl from the urllib.parse page with WebCrawler_v2, stopping once 10 pages
# have been fetched.
wc2 = WebCrawler_v2('https://pymotw.com/3/urllib.parse/index.html')
results = wc2.crawl(limit=10)
print(results)
# Number of pages in the returned sitemap; the bare number on the next line is
# the recorded notebook output, not code.
len(results)
10
class WebCrawler_v3:
    """Placeholder for the planned asyncio-based crawler.

    Only the constructor is functional; ``crawl`` has not been written yet.
    """

    def __init__(self, startUrl: str):
        """Remember the URL the crawl will start from."""
        self.startUrl = startUrl

    def crawl(self, limit: int = 1000) -> dict[str, list[str]]:
        """Crawl up to *limit* pages and return a URL -> links mapping.

        Raises:
            NotImplementedError: always, until this version is implemented.
                Raising keeps callers from receiving ``None`` where the
                signature promises a dict.
        """
        raise NotImplementedError("WebCrawler_v3.crawl is not implemented yet")
  1. Use trafilatura for proper extraction
  2. Deduplication with MinHash LSH for fuzzy/near-duplicate detection
  3. SHA-256 for exact dedup
#!pip install trafilatura
#from trafilatura import fetch_url, extract
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
# set_url() takes the location of the robots.txt file itself. Pointing it at
# an ordinary HTML page makes read() parse that page as if it were robots.txt,
# which yields no rules, so every can_fetch() check would trivially pass.
rp.set_url('https://pymotw.com/robots.txt')
rp.read()
# can_fetch(useragent, url): may the '*' user agent fetch this URL according
# to the rules `rp` has loaded? The bare `True` lines are the recorded
# notebook outputs, not code.
rp.can_fetch('*', 'https://pymotw.com/3/urllib.parse/index.html#parsing')
True
rp.can_fetch('*', 'https://pymotw.com/3/internet_protocols.html')
True
# Crawl-delay declared for '*' in the loaded rules (None when there is none).
rp.crawl_delay('*')