Installation of Scrapy

Once you have your Python 3 dev environment set up, the following will install Scrapy:

# Install Scrapy system-wide for Python 3 (-H sets HOME to root's so pip's
# cache is not written into the invoking user's home directory)
sudo -H pip3 install scrapy

Installation of Scrapyd

Scrapyd provides an API to start, stop, schedule, etc. your crawlers.

# Create a dedicated unprivileged system user to run scrapyd
adduser --system --home /var/lib/scrapyd --gecos "scrapy" --no-create-home \
        --disabled-password --quiet scrapy

# Create the data and log directories.
# NOTE: no trailing backslash after the last argument — a stray '\' at the
# end of the list would swallow the *next* command as extra arguments.
mkdir -p /var/lib/scrapyd/eggs \
         /var/lib/scrapyd/dbs \
         /var/log/scrapyd

# Hand ownership of both trees (and everything in them) to the scrapy user
chown -R scrapy:nogroup /var/log/scrapyd /var/lib/scrapyd

# Build dependencies for Pillow (used by Scrapy's image pipeline) plus curl
# for exercising the scrapyd HTTP API later
sudo apt-get -y install libjpeg-dev libfreetype6-dev zlib1g-dev libpng12-dev curl

# Use pip3 with -H, consistent with the Scrapy install above
sudo -H pip3 install scrapyd

Next create a service file to start/stop scrapyd via systemd

root@scrapyd:/usr/lib/systemd/user# cat /lib/systemd/system/scrapyd.service
[Unit]
Description=Scrapyd service
After=network.target

[Service]
User=scrapy
Group=nogroup
ExecStart=/usr/local/bin/scrapyd

[Install]
WantedBy=multi-user.target

Enable the service

root@scrapyd:~# systemctl enable scrapyd.service
Created symlink from /etc/systemd/system/multi-user.target.wants/scrapyd.service to /lib/systemd/system/scrapyd.service.

The configuration of scrapyd is done in /etc/scrapyd/scrapyd.conf

root@scrapyd:~# cat /etc/scrapyd/scrapyd.conf
# scrapyd requires its options to live under a [scrapyd] section and the
# webservice endpoints under [services]; without these headers the file
# is not a valid scrapyd.conf.
[scrapyd]
eggs_dir    = /var/lib/scrapyd/eggs
logs_dir    = /var/log/scrapyd/
# 'items' matches scrapyd's default directory name (was the uncreated 'item')
items_dir   = /var/lib/scrapyd/items
jobs_to_keep = 5
# Absolute path, consistent with the /var/lib/scrapyd/dbs directory
# created during installation (was the relative 'dbs')
dbs_dir     = /var/lib/scrapyd/dbs
max_proc    = 0
max_proc_per_cpu = 4
finished_to_keep = 100
poll_interval = 5.0
# scrapyd's default; change to 0.0.0.0 to accept remote API requests
bind_address = 127.0.0.1
http_port   = 6800
debug       = off
runner      = scrapyd.runner
# These two must not be empty — scrapyd's stock defaults:
application = scrapyd.app.application
launcher    = scrapyd.launcher.Launcher
webroot     = scrapyd.website.Root

[services]
schedule.json     = scrapyd.webservice.Schedule
cancel.json       = scrapyd.webservice.Cancel
addversion.json   = scrapyd.webservice.AddVersion
listprojects.json = scrapyd.webservice.ListProjects
listversions.json = scrapyd.webservice.ListVersions
listspiders.json  = scrapyd.webservice.ListSpiders
delproject.json   = scrapyd.webservice.DeleteProject
delversion.json   = scrapyd.webservice.DeleteVersion
listjobs.json     = scrapyd.webservice.ListJobs
daemonstatus.json = scrapyd.webservice.DaemonStatus