urllib.robotparser
The urllib.robotparser module provides a single class, RobotFileParser, for parsing and interpreting robots.txt files. This module is essential for web scraping applications that need to respect website crawling policies.
Quick Reference
Module: urllib.robotparser
Class: RobotFileParser
Purpose: Parse and interpret robots.txt files to determine crawling permissions
Documentation: urllib.robotparser
RFC: Robots Exclusion Standard
RobotFileParser Class
Constructor
urllib.robotparser.RobotFileParser(url='')
Parameters:
url(str, optional): URL of the robots.txt file to parse
import urllib.robotparser
# Option 1: hand the robots.txt location straight to the constructor
ROBOTS_TXT_URL = 'https://example.com/robots.txt'
rp = urllib.robotparser.RobotFileParser(ROBOTS_TXT_URL)

# Option 2: build an unconfigured parser and attach the location afterwards
rp = urllib.robotparser.RobotFileParser()
rp.set_url(ROBOTS_TXT_URL)
Core Methods
URL and Content Management
set_url(url)
Set the URL referring to the robots.txt file.
# Start from an unconfigured parser, then tell it where robots.txt lives.
# set_url() only records the address; the file is not downloaded here.
robots_location = 'https://httpbin.org/robots.txt'
rp = urllib.robotparser.RobotFileParser()
rp.set_url(robots_location)
read()
Read and parse the robots.txt file from the URL.
# Configure the parser for the target site, then download and parse its rules.
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://httpbin.org/robots.txt')
rp.read()  # Fetches and parses the robots.txt file
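Custom openers (RobotFileParser has no set_opener() method)
To fetch robots.txt through a custom URL opener (urllib.request.OpenerDirector instance), open the URL with that opener and pass the decoded lines to parse(); alternatively, install the opener globally with urllib.request.install_opener() before calling read().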
import urllib.request
# Create custom opener with authentication
auth_handler = urllib.request.HTTPBasicAuthHandler()
opener = urllib.request.build_opener(auth_handler)

# RobotFileParser offers no set_opener() method, so download robots.txt
# through the custom opener ourselves and hand the text to parse().
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')
with opener.open(rp.url) as response:
    rp.parse(response.read().decode('utf-8').splitlines())
Parsing Methods
can_fetch(useragent, url)
Check if the given user agent can fetch the specified URL.
# Load the site's rules once; both checks below reuse the same parser.
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://httpbin.org/robots.txt')
rp.read()

# Check if a user agent can fetch a URL (here: the catch-all agent '*')
status_page = 'https://httpbin.org/status/200'
allowed = rp.can_fetch('*', status_page)
print(f"Can fetch: {allowed}")

# Check for specific user agent
admin_page = 'https://httpbin.org/admin/'
allowed = rp.can_fetch('MyBot/1.0', admin_page)
print(f"MyBot can fetch admin: {allowed}")
mtime()
Return the time the robots.txt file was last fetched.
# Download the rules so the parser has a fetch time to report.
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://httpbin.org/robots.txt')
rp.read()

# mtime() reports when this parser last fetched robots.txt
# (not when the file itself was edited on the server).
last_modified = rp.mtime()
print(f"Last fetched: {last_modified}")
modified()
Set the time the robots.txt file was last fetched to the current time.
rp.modified() # Records the current time as the moment robots.txt was last fetched
crawl_delay(useragent)
Return the crawl delay for the given user agent, or None if not specified.
# Compare against None explicitly: a declared "Crawl-delay: 0" is a real
# value and must not be reported as "not specified".
delay = rp.crawl_delay('MyBot/1.0')
if delay is not None:
    print(f"Crawl delay: {delay} seconds")
else:
    print("No crawl delay specified")
request_rate(useragent)
Return the request rate for the given user agent as a RequestRate(requests, seconds) named tuple, or None if not specified.
# Look up the Request-rate rule that applies to our crawler.
rate = rp.request_rate('MyBot/1.0')
if not rate:
    print("No request rate specified")
else:
    # The rule unpacks into "this many requests" per "this many seconds".
    requests, seconds = rate
    print(f"Request rate: {requests} requests per {seconds} seconds")
site_maps()
Return the sitemap URLs specified in the robots.txt file as a list, or None if the file declares no sitemaps.
# site_maps() returns None (not an empty list) when robots.txt declares no
# Sitemap entries, so fall back to an empty list before iterating.
sitemaps = rp.site_maps() or []
for sitemap in sitemaps:
    print(f"Sitemap: {sitemap}")
Practical Examples
1. Basic Robots.txt Checking
import urllib.robotparser
def can_crawl_url(robots_url, user_agent, target_url):
    """Report whether ``user_agent`` may fetch ``target_url``.

    The rules are loaded from ``robots_url`` on every call.  Any exception
    raised while loading or evaluating them is printed and answered with
    ``False``.
    """
    try:
        parser = urllib.robotparser.RobotFileParser(robots_url)
        parser.read()
        verdict = parser.can_fetch(user_agent, target_url)
    except Exception as e:
        print(f"Error reading robots.txt: {e}")
        # Conservative approach - don't crawl if unsure
        verdict = False
    return verdict
# Example usage: may our crawler visit python.org's downloads page?
robots_url = 'https://www.python.org/robots.txt'
user_agent = 'MyWebCrawler/1.0'
target_url = 'https://www.python.org/downloads/'

allowed = can_crawl_url(
    robots_url=robots_url,
    user_agent=user_agent,
    target_url=target_url,
)
print(f"Can crawl {target_url}: {allowed}")
2. Web Crawler with Robots.txt Compliance
import urllib.robotparser
import urllib.parse
import time
from urllib.request import urlopen
class RobotCompliantCrawler:
    """Web crawler that respects robots.txt rules.

    One parsed robots.txt is cached per origin (``scheme://netloc``), and the
    time of the last request to each origin is remembered so the crawl delay
    can be applied between consecutive fetches.
    """

    def __init__(self, user_agent='PythonCrawler/1.0'):
        """Create a crawler that identifies itself to robots.txt as ``user_agent``."""
        self.user_agent = user_agent
        self.robots_cache = {}  # Cache robots.txt parsers, keyed by origin
        self.last_crawl = {}  # Track last crawl time (time.time()) per origin

    @staticmethod
    def _origin(url):
        """Return the ``scheme://netloc`` prefix of ``url`` used as cache key."""
        parsed_url = urllib.parse.urlparse(url)
        return f"{parsed_url.scheme}://{parsed_url.netloc}"

    def _get_robots_parser(self, url):
        """Get or create robots.txt parser for the domain.

        If robots.txt cannot be loaded, a parser with an empty rule set is
        cached instead, so every URL of that origin is allowed.
        """
        domain = self._origin(url)
        if domain not in self.robots_cache:
            robots_url = f"{domain}/robots.txt"
            try:
                rp = urllib.robotparser.RobotFileParser(robots_url)
                rp.read()
                print(f"Loaded robots.txt for {domain}")
            except Exception as e:
                print(f"Could not load robots.txt for {domain}: {e}")
                # Create empty parser that allows everything.  A parser that
                # has never read or parsed anything answers False to every
                # can_fetch() call, so feed it an empty rule set explicitly.
                rp = urllib.robotparser.RobotFileParser()
                rp.parse([])
            self.robots_cache[domain] = rp
        return self.robots_cache[domain]

    def can_fetch(self, url):
        """Check if URL can be fetched according to robots.txt."""
        rp = self._get_robots_parser(url)
        return rp.can_fetch(self.user_agent, url)

    def get_crawl_delay(self, url):
        """Get required crawl delay for domain (1.0 second if none is declared)."""
        rp = self._get_robots_parser(url)
        delay = rp.crawl_delay(self.user_agent)
        return delay if delay is not None else 1.0  # Default 1 second

    def get_request_rate(self, url):
        """Get request rate limits for domain (``None`` if none are declared)."""
        rp = self._get_robots_parser(url)
        return rp.request_rate(self.user_agent)

    def respect_rate_limit(self, url):
        """Apply rate limiting based on robots.txt and previous requests.

        Sleeps for whatever part of the crawl delay has not yet elapsed since
        the previous request to the same origin, then records the current
        time as that origin's latest request.
        """
        domain = self._origin(url)
        # Get required delay
        delay = self.get_crawl_delay(url)
        # Check last crawl time
        if domain in self.last_crawl:
            elapsed = time.time() - self.last_crawl[domain]
            if elapsed < delay:
                sleep_time = delay - elapsed
                print(f"Rate limiting: sleeping {sleep_time:.2f} seconds")
                time.sleep(sleep_time)
        self.last_crawl[domain] = time.time()

    def crawl_url(self, url):
        """Crawl a URL while respecting robots.txt.

        Returns the response body decoded as UTF-8 (undecodable bytes are
        dropped), or ``None`` when robots.txt forbids the URL or the fetch
        fails.
        """
        # Check if we're allowed to fetch this URL
        if not self.can_fetch(url):
            print(f"Robots.txt disallows fetching: {url}")
            return None
        # Apply rate limiting
        self.respect_rate_limit(url)
        # Fetch the URL
        try:
            print(f"Crawling: {url}")
            # The context manager closes the connection even if reading fails.
            with urlopen(url) as response:
                return response.read().decode('utf-8', errors='ignore')
        except Exception as e:
            print(f"Error crawling {url}: {e}")
            return None

    def get_sitemaps(self, url):
        """Get sitemap URLs for the domain (``None`` if robots.txt lists none)."""
        rp = self._get_robots_parser(url)
        return rp.site_maps()
# Example usage: crawl a few python.org pages with a robots.txt-aware crawler
crawler = RobotCompliantCrawler('MyBot/1.0')
# Test URLs
urls = [
    'https://www.python.org/',
    'https://www.python.org/downloads/',
    'https://www.python.org/about/'
]
for url in urls:
    # crawl_url() returns None when the fetch is disallowed or fails
    content = crawler.crawl_url(url)
    if content:
        print(f"Successfully crawled {url} ({len(content)} characters)")
# Get sitemaps
# NOTE(review): the original indentation was lost; this lookup is assumed to
# run once after the loop, using the last URL visited - confirm.
sitemaps = crawler.get_sitemaps(url)
if sitemaps:
    print(f"Sitemaps for domain: {sitemaps}")
3. Robots.txt Analyzer
import urllib.robotparser
import urllib.parse
class RobotsAnalyzer:
    """Analyze robots.txt files for insights.

    The file at ``robots_url`` is fetched once in the constructor; the other
    methods only query the resulting parser.
    """

    def __init__(self, robots_url):
        """Fetch and parse ``robots_url``; ``self.loaded`` records success."""
        self.robots_url = robots_url
        self.rp = urllib.robotparser.RobotFileParser(robots_url)
        try:
            self.rp.read()
            self.loaded = True
        except Exception as e:
            print(f"Error loading robots.txt: {e}")
            self.loaded = False

    def _sitemap_list(self):
        """Return the declared sitemap URLs as a list (empty when none)."""
        # site_maps() returns None rather than [] when robots.txt has no
        # Sitemap entries; normalise so callers can iterate and count safely.
        return list(self.rp.site_maps() or [])

    def analyze(self):
        """Perform comprehensive analysis of robots.txt file.

        Returns a dict with the keys ``url``, ``last_modified``, ``sitemaps``,
        ``user_agents``, ``common_paths`` and ``crawl_policies``, or
        ``{"error": ...}`` when the file could not be loaded.
        """
        if not self.loaded:
            return {"error": "Could not load robots.txt file"}
        analysis = {
            "url": self.robots_url,
            # mtime() is the time the parser last fetched the file
            "last_modified": self.rp.mtime(),
            "sitemaps": self._sitemap_list(),
            "user_agents": self._analyze_user_agents(),
            "common_paths": self._test_common_paths(),
            "crawl_policies": self._analyze_crawl_policies()
        }
        return analysis

    def _analyze_user_agents(self):
        """Analyze different user agent permissions."""
        test_agents = [
            '*',
            'Googlebot',
            'Bingbot',
            'Slurp',
            'DuckDuckBot',
            'facebookexternalhit',
            'Twitterbot',
            'BadBot'
        ]
        agent_analysis = {}
        for agent in test_agents:
            agent_analysis[agent] = {
                "crawl_delay": self.rp.crawl_delay(agent),
                "request_rate": self.rp.request_rate(agent),
            }
        return agent_analysis

    def _test_common_paths(self):
        """Test access to common website paths."""
        base_url = self.robots_url.replace('/robots.txt', '')
        common_paths = [
            '/',
            '/admin/',
            '/wp-admin/',
            '/private/',
            '/api/',
            '/login/',
            '/search',
            '/images/',
            '/css/',
            '/js/',
            '/sitemap.xml',
            '/sitemap/',
        ]
        path_results = {}
        for path in common_paths:
            full_url = base_url + path
            path_results[path] = {
                "allowed_for_star": self.rp.can_fetch('*', full_url),
                "allowed_for_googlebot": self.rp.can_fetch('Googlebot', full_url),
                "allowed_for_custom": self.rp.can_fetch('MyBot/1.0', full_url)
            }
        return path_results

    def _analyze_crawl_policies(self):
        """Analyze overall crawling policies."""
        policies = {
            "has_crawl_delays": False,
            "has_request_rates": False,
            "has_sitemaps": len(self._sitemap_list()) > 0,
            "restrictive_level": "unknown"
        }
        # Check for crawl delays and request rates
        test_agents = ['*', 'Googlebot', 'MyBot/1.0']
        for agent in test_agents:
            if self.rp.crawl_delay(agent) is not None:
                policies["has_crawl_delays"] = True
            if self.rp.request_rate(agent) is not None:
                policies["has_request_rates"] = True
        # Estimate restrictiveness from how many probe URLs '*' may fetch
        base_url = self.robots_url.replace('/robots.txt', '')
        test_urls = [base_url + path for path in ['/', '/admin/', '/api/']]
        allowed_count = sum(1 for url in test_urls
                            if self.rp.can_fetch('*', url))
        if allowed_count == len(test_urls):
            policies["restrictive_level"] = "permissive"
        elif allowed_count == 0:
            policies["restrictive_level"] = "restrictive"
        else:
            policies["restrictive_level"] = "moderate"
        return policies

    def print_analysis(self):
        """Print formatted analysis results."""
        analysis = self.analyze()
        if "error" in analysis:
            print(f"Analysis failed: {analysis['error']}")
            return
        print(f"🤖 Robots.txt Analysis for: {analysis['url']}")
        print(f"📅 Last Modified: {analysis['last_modified']}")
        print()
        # Sitemaps
        print("🗺️ Sitemaps:")
        if analysis['sitemaps']:
            for sitemap in analysis['sitemaps']:
                print(f" • {sitemap}")
        else:
            print(" None specified")
        print()
        # Crawl policies
        policies = analysis['crawl_policies']
        print("📋 Crawl Policies:")
        print(f" • Restrictiveness: {policies['restrictive_level']}")
        print(f" • Has crawl delays: {policies['has_crawl_delays']}")
        print(f" • Has request rates: {policies['has_request_rates']}")
        print(f" • Provides sitemaps: {policies['has_sitemaps']}")
        print()
        # User agent analysis
        print("🔍 User Agent Analysis:")
        for agent, data in analysis['user_agents'].items():
            delay = data['crawl_delay']
            rate = data['request_rate']
            print(f" • {agent}:")
            # Test against None so a declared delay of 0 is still shown
            if delay is not None:
                print(f" - Crawl delay: {delay}s")
            if rate is not None:
                print(f" - Request rate: {rate[0]} req/{rate[1]}s")
            if delay is None and rate is None:
                print(" - No specific restrictions")
        print()
        # Path permissions
        print("🛣️ Common Path Permissions:")
        for path, perms in analysis['common_paths'].items():
            print(f" • {path}")
            print(f" - * (wildcard): {'✅' if perms['allowed_for_star'] else '❌'}")
            print(f" - Googlebot: {'✅' if perms['allowed_for_googlebot'] else '❌'}")
# Example usage: fetch python.org's robots.txt and print the full report
python_org_robots = 'https://www.python.org/robots.txt'
analyzer = RobotsAnalyzer(python_org_robots)
analyzer.print_analysis()
4. Bulk Robots.txt Checker
import urllib.robotparser
import concurrent.futures
import urllib.parse
def check_robots_compliance(site_url, user_agent='*', paths=None):
    """Check robots.txt compliance for multiple paths on a site.

    Parameters:
        site_url: Any URL on the site; only its scheme and host are used.
        user_agent: Agent name the rules are evaluated for.
        paths: Paths to test; defaults to '/', '/admin/', '/api/', '/search'.

    Returns:
        A dict.  On success it has "accessible": True plus "crawl_delay",
        "request_rate", "sitemaps" (a list, empty when none are declared)
        and "path_permissions" (path -> bool).  If loading or evaluating
        the rules raises, it has "accessible": False, "error" and an empty
        "path_permissions".
    """
    if paths is None:
        paths = ['/', '/admin/', '/api/', '/search']
    parsed_url = urllib.parse.urlparse(site_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    robots_url = f"{base_url}/robots.txt"
    try:
        rp = urllib.robotparser.RobotFileParser(robots_url)
        rp.read()
        results = {
            "site": base_url,
            "robots_url": robots_url,
            "accessible": True,
            "crawl_delay": rp.crawl_delay(user_agent),
            "request_rate": rp.request_rate(user_agent),
            # site_maps() returns None when no Sitemap line exists; without
            # the fallback, list(None) would raise and the site would be
            # misreported as inaccessible by the handler below.
            "sitemaps": list(rp.site_maps() or []),
            "path_permissions": {}
        }
        for path in paths:
            full_url = base_url + path
            results["path_permissions"][path] = rp.can_fetch(user_agent, full_url)
        return results
    except Exception as e:
        return {
            "site": base_url,
            "robots_url": robots_url,
            "accessible": False,
            "error": str(e),
            "path_permissions": {}
        }
def bulk_robots_check(sites, user_agent='MyBot/1.0', max_workers=5):
    """Check robots.txt for multiple sites concurrently.

    Each site is handed to ``check_robots_compliance`` on a worker thread.
    Results are collected in completion order, so the returned list is not
    guaranteed to follow the order of ``sites``.
    """
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which site each future belongs to for error reporting.
        future_to_site = {}
        for site in sites:
            pending = executor.submit(check_robots_compliance, site, user_agent)
            future_to_site[pending] = site

        for future in concurrent.futures.as_completed(future_to_site):
            site = future_to_site[future]
            try:
                outcome = future.result()
            except Exception as e:
                outcome = {
                    "site": site,
                    "accessible": False,
                    "error": f"Future failed: {e}"
                }
            results.append(outcome)
    return results
# Example usage: survey the robots.txt policy of several well-known sites
sites = [
    'https://www.python.org',
    'https://docs.python.org',
    'https://pypi.org',
    'https://github.com',
    'https://stackoverflow.com'
]
print("🔍 Checking robots.txt compliance for multiple sites...")
results = bulk_robots_check(sites)
for result in results:
    print(f"\n🌐 Site: {result['site']}")
    # Report failures first so the success path below stays flat.
    if not result['accessible']:
        print(f"❌ Error: {result.get('error', 'Unknown error')}")
        continue
    print("✅ Robots.txt accessible")
    if result['crawl_delay']:
        print(f"⏱️ Crawl delay: {result['crawl_delay']}s")
    if result['sitemaps']:
        print(f"🗺️ Sitemaps: {len(result['sitemaps'])}")
    print("📂 Path permissions:")
    for path, allowed in result['path_permissions'].items():
        status = '✅' if allowed else '❌'
        print(f" {status} {path}")
Performance Considerations
Caching Robots.txt Files
import threading
import time
import urllib.robotparser
class CachedRobotsParser:
    """Thread-safe cached robots.txt parser.

    Parsers are cached per robots.txt URL and reused until they are older
    than ``cache_ttl`` seconds.  A single lock guards both the cache and the
    download, so concurrent callers are served one at a time.
    """

    def __init__(self, cache_ttl=3600):  # 1 hour default TTL
        self.cache = {}  # robots_url -> (parser, time.monotonic() at load)
        self.cache_ttl = cache_ttl
        self.lock = threading.Lock()

    def get_parser(self, robots_url):
        """Get cached or fresh robots parser.

        Returns the parser, or ``None`` (after printing the error) when the
        file could not be loaded.
        """
        with self.lock:
            # Use the monotonic clock for the TTL: the wall clock can be
            # adjusted, which would expire entries early or keep them
            # alive past their TTL.
            now = time.monotonic()
            # Check cache
            if robots_url in self.cache:
                parser, timestamp = self.cache[robots_url]
                if now - timestamp < self.cache_ttl:
                    return parser
            # Create new parser
            try:
                parser = urllib.robotparser.RobotFileParser(robots_url)
                parser.read()
                self.cache[robots_url] = (parser, now)
                return parser
            except Exception as e:
                print(f"Error loading {robots_url}: {e}")
                return None

    def can_fetch(self, robots_url, user_agent, url):
        """Check if URL can be fetched with caching."""
        parser = self.get_parser(robots_url)
        if parser:
            return parser.can_fetch(user_agent, url)
        return False  # Conservative default
# Global cached parser instance, created with the default one-hour TTL
cached_parser = CachedRobotsParser()
Error Handling and Edge Cases
Robust Robots.txt Handling
import urllib.robotparser
import urllib.error
import socket
def safe_robots_check(url, user_agent='*', timeout=10):
    """Safely check robots.txt with comprehensive error handling.

    Parameters:
        url: The page whose crawl permission is being checked.
        user_agent: Agent name the rules are evaluated for.
        timeout: Seconds to wait for the robots.txt download.

    Returns:
        A dict that always has "allowed" and "status".  On success it also
        has "robots_url", "crawl_delay" and "request_rate"; otherwise it has
        a "message" describing the problem.  "status" is one of "success",
        "no_robots_txt", "http_error", "url_error", "timeout" or
        "unknown_error".
    """
    # Local imports keep the function usable even when the surrounding file
    # only imports urllib.robotparser and urllib.error.
    import urllib.parse
    import urllib.request

    timeout_result = {
        "allowed": False,
        "status": "timeout",
        "message": f"Timeout after {timeout} seconds"
    }
    try:
        parsed_url = urllib.parse.urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        # Download the file ourselves instead of calling rp.read():
        #  * read() swallows HTTPError, so the 404 / http_error handling
        #    below would never run;
        #  * a per-request timeout avoids changing the process-wide socket
        #    default, which would affect other threads.
        with urllib.request.urlopen(robots_url, timeout=timeout) as response:
            raw = response.read()
        rp = urllib.robotparser.RobotFileParser(robots_url)
        rp.parse(raw.decode("utf-8").splitlines())
        return {
            "allowed": rp.can_fetch(user_agent, url),
            "robots_url": robots_url,
            "status": "success",
            "crawl_delay": rp.crawl_delay(user_agent),
            "request_rate": rp.request_rate(user_agent)
        }
    except urllib.error.HTTPError as e:
        if e.code == 404:
            # No robots.txt file - allow crawling
            return {
                "allowed": True,
                "status": "no_robots_txt",
                "message": "No robots.txt file found - allowing by default"
            }
        else:
            return {
                "allowed": False,
                "status": "http_error",
                "message": f"HTTP {e.code}: {e.reason}"
            }
    except urllib.error.URLError as e:
        # A timeout while connecting arrives wrapped in URLError.
        if isinstance(e.reason, socket.timeout):
            return timeout_result
        return {
            "allowed": False,
            "status": "url_error",
            "message": f"URL error: {e.reason}"
        }
    except socket.timeout:
        # A timeout while reading the response body is raised directly.
        return timeout_result
    except Exception as e:
        return {
            "allowed": False,
            "status": "unknown_error",
            "message": f"Unexpected error: {e}"
        }
# Example usage: check one page and report the outcome
result = safe_robots_check(url='https://example.com/page', user_agent='MyBot/1.0')
print(f"Can crawl: {result['allowed']}")
print(f"Status: {result['status']}")
# "message" is only present when the check did not succeed
if 'message' in result:
    print(f"Message: {result['message']}")
Related Components
- urllib.request.OpenerDirector - Custom URL openers for robots.txt fetching
- urllib.request.urlopen - Basic URL opening functionality
- urllib.error - Exception handling for network errors
- urllib.parse - URL parsing for robots.txt URL construction
Additional Resources
- urllib.robotparser documentation
- Robots Exclusion Standard
- Google's robots.txt guidelines
- Web scraping ethics and robots.txt
The urllib.robotparser module is essential for ethical web scraping and automated web access. It ensures your applications respect website owners' crawling preferences as specified in their robots.txt files.