Installation
shell
pip install scrapy-playwright
playwright install chromium
settings.py
Replace Scrapy's default HTTP handler for both schemes with the Playwright handler. Define a named context with the proxy credentials.
myspider/settings.py (python)
# Send both URL schemes through the Playwright download handler.
DOWNLOAD_HANDLERS = dict.fromkeys(
    ("http", "https"),
    "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
)

# Run Twisted on top of the asyncio event loop.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

PLAYWRIGHT_BROWSER_TYPE = "chromium"
PLAYWRIGHT_LAUNCH_OPTIONS = dict(headless=True)

# Named browser context, carrying the proxy credentials, that requests can
# select by name.
PLAYWRIGHT_CONTEXTS = {
    "default": dict(
        proxy=dict(
            server="http://gw.justproxies.online:8080",
            username="USER",
            password="PASS",
        ),
    ),
}
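Any spider request with meta["playwright"] = True and meta["playwright_context"] = "default" runs inside this context and exits through a rotating IP. Because "default" is also the context scrapy-playwright falls back to when a request names none, every Playwright request that omits playwright_context uses this proxy as well.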
Rotating proxy per request
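To guarantee a fresh IP on every request, create a new context per request. scrapy-playwright does not close contexts created this way on its own: set meta["playwright_include_page"] = True and close the page and then its context (await page.close(), then await page.context.close()) in the callback and in an errback, otherwise open contexts accumulate for the lifetime of the crawl.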
myspider/spiders/shop.py (python)
import secrets
import scrapy
from scrapy_playwright.page import PageMethod
class RotatingSpider(scrapy.Spider):
    """Fetch every start URL through its own throwaway Playwright context.

    Each request names a context that does not exist yet, so a new one is
    created with the proxy settings given in ``playwright_context_kwargs``.
    Contexts stay open until they are closed explicitly, so both the callback
    and the errback close the page and its context; without that, one open
    context per request would pile up for the whole crawl.
    """

    name = "rotating"
    start_urls = ["https://target.com/listings"]

    def start_requests(self):
        """Yield one Playwright request per start URL, each in a new context."""
        for url in self.start_urls:
            ctx_name = f"ctx-{secrets.token_hex(4)}"  # unique per request
            yield scrapy.Request(
                url,
                meta={
                    "playwright": True,
                    # Hand the Page to the callback/errback so the context
                    # can be closed once the request is finished.
                    "playwright_include_page": True,
                    "playwright_context": ctx_name,
                    "playwright_context_kwargs": {
                        "proxy": {
                            "server": "http://gw.justproxies.online:8080",
                            "username": "USER",
                            "password": "PASS",
                        }
                    },
                    "playwright_page_methods": [
                        PageMethod("wait_for_load_state", "networkidle"),
                    ],
                },
                errback=self.close_context_on_error,
            )

    async def parse(self, response):
        """Emit the page URL and title, then release the per-request context."""
        item = {"url": response.url, "title": response.css("title::text").get()}
        page = response.meta["playwright_page"]
        # Close the page before its context, as scrapy-playwright recommends.
        await page.close()
        await page.context.close()
        yield item

    async def close_context_on_error(self, failure):
        """Release the page and context of a request that failed."""
        # The page may be missing if the failure happened before it was created.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
            await page.context.close()
Creating a new context per request is more expensive than reusing one — a fresh browser context involves a TLS handshake and page load. Use it when IP diversity matters more than throughput.
Sticky sessions
Bake a session token into the username on the context. Reuse the same context name across a multi-step flow to keep the same exit IP.
myspider/spiders/login.py (python)
import secrets
import scrapy
from scrapy_playwright.page import PageMethod
class LoginSpider(scrapy.Spider):
    """Log in, then load the dashboard inside the same named Playwright context."""

    name = "login"

    def start_requests(self):
        """Yield the login request, bound to a context named after a random token."""
        session_id = secrets.token_hex(8)
        context_name = f"session-{session_id}"
        # The same token is embedded in the proxy username.
        proxy_settings = dict(
            server="http://gw.justproxies.online:8080",
            username=f"USER-session-{session_id}",
            password="PASS",
        )
        login_steps = [
            PageMethod("fill", "#username", "myuser"),
            PageMethod("fill", "#password", "mypass"),
            PageMethod("click", "button[type=submit]"),
            PageMethod("wait_for_load_state", "networkidle"),
        ]
        login_meta = {
            "playwright": True,
            "playwright_context": context_name,
            "playwright_context_kwargs": {"proxy": proxy_settings},
            "playwright_page_methods": login_steps,
            # Carried along so the follow-up request can name the same context.
            "session_ctx": context_name,
            "session_proxy": proxy_settings,
        }
        yield scrapy.Request(
            "https://target.com/login",
            meta=login_meta,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Request the dashboard through the context used for the login."""
        carried = response.meta
        # Reuse the same context — same IP as the login.
        dashboard_meta = {
            "playwright": True,
            "playwright_context": carried["session_ctx"],
            "playwright_context_kwargs": {"proxy": carried["session_proxy"]},
        }
        yield scrapy.Request(
            "https://target.com/dashboard",
            meta=dashboard_meta,
            callback=self.parse_dashboard,
        )

    def parse_dashboard(self, response):
        """Placeholder for dashboard parsing."""
        ...
Page methods and JS rendering
PageMethod calls Playwright page methods before Scrapy receives the response body. Common use cases:
- PageMethod("wait_for_selector", ".results") — wait for a CSS selector to appear before parsing.
- PageMethod("evaluate", "window.scrollTo(0, document.body.scrollHeight)") — scroll to trigger lazy-load.
- PageMethod("click", "button.load-more") — click a pagination control before capturing the page.
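Playwright requests consume more bandwidth than bare HTTP — a browser fetches fonts, images, and analytics scripts. If you only need the HTML, block non-essential asset requests to reduce GB spend. Page methods run after the initial navigation, so
PageMethod("route", "**/*.{png,jpg,gif,woff2}", lambda r: r.abort()) only blocks asset requests made after the page has loaded (for example while scrolling or clicking). To block them during the initial load as well, set the PLAYWRIGHT_ABORT_REQUEST setting to a predicate that receives the Playwright request and returns True for requests to drop, e.g. lambda req: req.resource_type in {"image", "font", "media"}.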