/src/crawler/text_crawler.pconf
load("crawler.proto", "Crawler", "CrawlerService")
def default_crawler():
    """Return a new Crawler with user_agent "Linux" and http_timeout 30.

    Callers (see main) adjust fields on the returned message afterwards.
    """
    baseline = Crawler(
        user_agent="Linux",
        http_timeout=30,
    )
    return baseline
def main():
    """Build the CrawlerService config message for this file.

    Produces three crawlers derived from default_crawler() whose
    http_timeout values are 30, 60 and 90; only the first crawler has
    follow_redirects enabled. A single admin, "superuser", is mapped to
    GOD_MODE, and log_level is set to 2.

    Returns:
        The populated CrawlerService message.
    """
    fleet = [default_crawler() for _ in range(3)]

    # Timeouts step up by 30 per crawler: 30 for the first, then 60, then 90.
    for position, member in enumerate(fleet):
        member.http_timeout = 30 * (position + 1)

    # Only the first crawler follows redirects.
    fleet[0].follow_redirects = True

    admins = {"superuser": CrawlerService.AdminPermission.GOD_MODE}
    return CrawlerService(
        crawlers=fleet,
        admins=admins,
        log_level=2,
    )
/src/crawler/crawler.proto
syntax = "proto3";
// Settings for a single crawler.
message Crawler {
  // User-agent value for this crawler.
  // NOTE(review): presumably sent as the HTTP User-Agent header - confirm.
  string user_agent = 1;

  // HTTP timeout for this crawler.
  // NOTE(review): unit is not stated in this file (presumably seconds) - confirm.
  int32 http_timeout = 2;

  // Whether this crawler follows redirects; proto3 bools default to false
  // when unset.
  bool follow_redirects = 3;
}
// Top-level service config: a set of crawlers, an admin table, and a
// log level.
message CrawlerService {
  // Permission granted to an admin entry. READ_WRITE is the zero value,
  // so it is what an unset permission reads as in proto3.
  enum AdminPermission {
    READ_WRITE = 0;
    GOD_MODE = 1;
  }

  // Crawlers managed by this service.
  repeated Crawler crawlers = 1;

  // Admin permissions, keyed by a string identifying the admin.
  map<string, AdminPermission> admins = 2;

  // Logging level.
  // NOTE(review): the meaning of each numeric level is not defined in
  // this file - confirm against the consumer.
  int32 log_level = 3;
}
/materialized_config/crawler/text_crawler.materialized_JSON
{
"protoFile": "crawler/crawler.proto",
"value": {
"@type": "type.googleapis.com/CrawlerService",
"admins": {
"superuser": "GOD_MODE"
},
"crawlers": [
{
"userAgent": "Linux",
"httpTimeout": 30,
"followRedirects": true
},
{
"userAgent": "Linux",
"httpTimeout": 60
},
{
"userAgent": "Linux",
"httpTimeout": 90
}
],
"logLevel": 2
}
}
/src/crawler/crawler.proto-validator
load("crawler.proto", "Crawler", "CrawlerService")
def test_crawlers_not_empty(cs):
    """Fail validation when a CrawlerService has no crawlers.

    Args:
        cs: the message under validation; only its `crawlers` field is read.
    """
    crawler_count = len(cs.crawlers)
    if crawler_count >= 1:
        return
    fail("Crawlers can't be empty")

# Hand the check to add_validator for CrawlerService messages.
add_validator(CrawlerService, test_crawlers_not_empty)
def test_http_timeout(c):
    """Fail validation when a Crawler's http_timeout is below the minimum of 10.

    Args:
        c: the message under validation; only its `http_timeout` field is read.
    """
    floor = 10
    observed = c.http_timeout
    if observed >= floor:
        return
    message = "Crawler HTTP timeout must be at least %d, got %d" % (floor, observed)
    fail(message)

# Hand the check to add_validator for Crawler messages.
add_validator(Crawler, test_http_timeout)