Format

All Scrapy crawlers should create a JSON file in the following format. The field names are defined by the JobEntity of the YAWIK project.

simple-import-schema.json
{
    "$id": "https://scrapy-docs.yawik.org/build/html/_downloads/simple-import-schema.json",
    "$schema": "http://json-schema.org/draft-07/schema#",
    "description": "Defines a list of job postings",
    "type": "object",
    "properties": {
        "jobs": {
            "description": "list of job postings",
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {
                        "description": "unique identifier of the job posting (must not contain dots)",
                        "type": "string"
                    },
                    "title": {
                        "description": "title of a job posting",
                        "type": "string"
                    },
                    "location": {
                        "description": "location of a job posting",
                        "type": "string"
                    },
                    "company": {
                        "description": "name of the company",
                        "type": "string"
                    },
                    "reference": {
                        "description": "reference of the job posting used by the hiring organization",
                        "type": "string"
                    },
                    "contactEmail": {
                        "description": "email address for applications (if available)",
                        "type": "string"
                    },
                    "language": {
                        "description": "language of the job posting",
                        "type": "string"
                    },
                    "link": {
                        "description": "link to the detail page of the job posting",
                        "type": "string"
                    },
                    "datePublishStart": {
                        "description": "date of the job posting (date format DD.MM.YYYY)",
                        "type": "string"
                    },
                    "datePublishEnd": {
                        "description": "End date of the job posting (date format DD.MM.YYYY)",
                        "type": "string"
                    },
                    "logoRef": {
                        "description": "link to a logo of the hiring organization",
                        "type": "string"
                    },
                    "linkApply": {
                        "description": "link which references an application form",
                        "type": "string"
                    },
                    "classifications": {
                        "type": "object",
                        "properties": {
                            "professions": {
                                "type": "array",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "industries": {
                                "type": "array",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "employmentTypes": {
                                "type": "array",
                                "items": {
                                    "type": "string"
                                }
                            }
                        }
                    },
                    "templateValues": {
                        "type": "object",
                        "properties": {
                            "description": {
                                "description": "Introduction of the job advertisement. Usually a description of the company",
                                "type": "string"
                            },
                            "tasks": {
                                "description": "Description of the tasks.",
                                "type": "string"
                            },
                            "requirements": {
                                "description": "Description of the requirements or qualifications",
                                "type": "string"
                            },
                            "benefits": {
                                "description": "Description of the benefits offered",
                                "type": "string"
                            },
                            "html": {
                                "description": "Complete HTML of the job posting",
                                "type": "string"
                            }
                        }
                    }
                },
                "required": [
                    "id",
                    "title",
                    "link"
                ]
            }
        }
    }
}

Example:

{
    "jobs": [
        {
            "id": "Example-123/456",
            "title": "title of the job posting",
            "location": "location of the job posting",
            "company": "name of the hiring organization",
            "reference": "reference of the job posting used by the hiring organization",
            "contactEmail": "email address for applications (if available)",
            "language": "language of the job posting",
            "link": "link to the detail page of the job posting",
            "datePublishStart": "date of the job posting (date format DD.MM.YYYY)",
            "datePublishEnd": "End date of the job posting (date format DD.MM.YYYY)",
            "logoRef": "link to a logo of the hiring organization",
            "linkApply": "link which references an application form",
            "classifications": {
                "professions": [
                    "software-developer",
                    "sales manager"
                ],
                "industries": [
                    "banking",
                    "IT"
                ],
                "employmentTypes": [
                    "contract",
                    "internship",
                    "freelancer"
                ]
            },
            "templateValues": {
                "introduction": "<p></p>",
                "description": "<p>We're a good company<\/p>",
                "tasks": "<b>Your Tasks<\/b><ul><li>Task 1<\/li><li>Task2<\/li><\/ul>",
                "requirements": "<b>Qualifications<\/b><ul><li>requirement 1<\/li><li>requirement 2<\/li><\/ul>",
                "benefits": "<b>We offer<\/b><ul><li>offer 1<\/li><li>offer 2<\/li><\/ul>",
                "boilerplate": "<p></p>",
                "html": "<p>complete html<\/p>"
            }
        },
        {
            "id": "Example-123/457",
            "title": "title of another job posting",
            "link": "link to the detail page of the other job posting"
        }
    ]
}
field      description
id         unique identifier. Must not contain dots ('.')
title      title of the job posting
location   location of the job posting
link       link to the detail page of the job posting

The fields id, title and link are required. All other fields are optional. If the data can be crawled, put it into the described JSON format.

The fields datePublishStart and datePublishEnd should be in the format DD.MM.YYYY

basic crawlers

A basic crawler provides the fields required for a job list. These are:

  • id
  • title
  • link
  • location (if available in the overview)

A basic crawler is created to show that crawling is basically possible.

full crawler

A full crawler should contain all data that is available from the job listing and the job detail page. Full crawlers are created when it is clear who pays for the work.