Installation of Scrapy
Once you have your Python 3 development environment set up, the following command will install Scrapy:
sudo -H pip3 install scrapy
Installation of Scrapyd
Scrapyd provides an API to start, stop, and schedule crawlers (among other operations).
adduser --system --home /var/lib/scrapyd --gecos "scrapy" --no-create-home \
--disabled-password --quiet scrapy
mkdir -p /var/lib/scrapyd \
/var/log/scrapyd \
/var/lib/scrapyd/eggs \
/var/lib/scrapyd/dbs \
/var/lib/scrapyd/items
chown scrapy:nogroup /var/log/scrapyd /var/lib/scrapyd \
/var/lib/scrapyd/eggs \
/var/lib/scrapyd/dbs \
/var/lib/scrapyd/items
sudo apt-get -y install libjpeg-dev libfreetype6-dev zlib1g-dev libpng12-dev curl
sudo -H pip3 install scrapyd
Next, create a service file so that scrapyd can be started and stopped via systemd:
root@scrapyd:/usr/lib/systemd/user# cat /lib/systemd/system/scrapyd.service
[Unit]
Description=Scrapyd service
After=network.target
[Service]
User=scrapy
Group=nogroup
WorkingDirectory=/var/lib/scrapyd
ExecStart=/usr/local/bin/scrapyd
Restart=always
[Install]
WantedBy=multi-user.target
Enable the service:
root@scrapyd:~# systemctl enable scrapyd.service
Created symlink from /etc/systemd/system/multi-user.target.wants/scrapyd.service to /lib/systemd/system/scrapyd.service.
Scrapyd is configured in /etc/scrapyd/scrapyd.conf:
root@scrapyd:~# cat /etc/scrapyd/scrapyd.conf
[scrapyd]
eggs_dir = /var/lib/scrapyd/eggs
logs_dir = /var/log/scrapyd/
items_dir = /var/lib/scrapyd/items
jobs_to_keep = 5
dbs_dir = dbs
max_proc = 0
max_proc_per_cpu = 4
finished_to_keep = 100
poll_interval = 5.0
bind_address = 0.0.0.0
http_port = 6800
debug = off
runner = scrapyd.runner
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
webroot = scrapyd.website.Root
[services]
schedule.json = scrapyd.webservice.Schedule
cancel.json = scrapyd.webservice.Cancel
addversion.json = scrapyd.webservice.AddVersion
listprojects.json = scrapyd.webservice.ListProjects
listversions.json = scrapyd.webservice.ListVersions
listspiders.json = scrapyd.webservice.ListSpiders
delproject.json = scrapyd.webservice.DeleteProject
delversion.json = scrapyd.webservice.DeleteVersion
listjobs.json = scrapyd.webservice.ListJobs
daemonstatus.json = scrapyd.webservice.DaemonStatus