From 738d43adea4b7fea27efacb3e113f1d9cfdd4a13 Mon Sep 17 00:00:00 2001
From: dvdrw
Date: Wed, 4 Oct 2023 15:54:07 +0200
Subject: [PATCH] feat: fan out scrapes over time, support different scrape
 intervals

This should help avoid rate limits on upstream, as well as let admins
tune update latency manually.
---
 main/main.go      | 28 ++++++++++++++++++++--------
 scraper/scrape.go |  4 +++-
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/main/main.go b/main/main.go
index f3dabff..fde440d 100644
--- a/main/main.go
+++ b/main/main.go
@@ -9,6 +9,7 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"time"
 
 	"git.dvdrw.dev/nsmarter/scraper/scraper"
 )
@@ -24,6 +25,7 @@ var chunkSize = 5
 var apiEndpoint string = "https://online.nsmart.rs/publicapi/v1/announcement/announcement.php"
 var apiKey string
 var limitQuerySize = 0
+var timeoutPeriod = 180
 var fillerUrl string = "localhost"
 
 func parseEnvars() {
@@ -41,6 +43,13 @@ func parseEnvars() {
 			Log.Printf("WARN: Invalid value for CHUNK_SIZE. Falling back to default value (%v)\n", defaultChunkSize)
 			chunkSize = defaultChunkSize
 		}
+	case "TIMEOUT_PERIOD":
+		var err error
+		timeoutPeriod, err = strconv.Atoi(pair[1])
+		if err != nil {
+			Log.Printf("WARN: Invalid value for TIMEOUT_PERIOD. Falling back to default value (%v)\n", 180)
+			timeoutPeriod = 180
+		}
 	case "LIMIT_QUERY_SIZE":
 		var err error
 		limitQuerySize, err = strconv.Atoi(pair[1])
@@ -86,16 +95,19 @@ func main() {
 	}
 
 	results := make(chan []scraper.ScrapeResult, 200)
-	for _, chunk := range stationChunks {
-		go scraper.ScheduleScrape(chunk,
-			results,
-			scraper.ApiConfig{Endpoint: apiEndpoint,
-				Key: apiKey})
-	}
+	go func() {
+		for _, chunk := range stationChunks {
+			go scraper.ScheduleScrape(chunk,
+				results,
+				scraper.ApiConfig{Endpoint: apiEndpoint,
+					Key:     apiKey,
+					Timeout: int64(timeoutPeriod)})
+			// Fan out scrapes over time to avoid upstream rate limits
+			time.Sleep(time.Millisecond * 100)
+		}
+	}()
 
 	for r := range results {
-		fmt.Printf("Received data: %#v\n", r)
-
 		json, err := json.Marshal(r)
 		if err != nil {
 			fmt.Print("Couldn't serialise struct to JSON: ", err)
diff --git a/scraper/scrape.go b/scraper/scrape.go
index 5d23803..c5dccc1 100644
--- a/scraper/scrape.go
+++ b/scraper/scrape.go
@@ -60,6 +60,7 @@ func scrapeRange(s []Station) string {
 
 type ApiConfig struct {
 	Endpoint, Key string
+	Timeout       int64
 }
 
 func grabData(stations []Station, c ApiConfig) (map[string][]map[string]interface{}, error) {
@@ -296,6 +297,7 @@ func ScheduleScrape(chunk []Station, c chan []ScrapeResult, a ApiConfig) {
 	}
 
 	c <- r
-	time.Sleep(time.Minute * 3)
+	duration := time.Duration(a.Timeout) * time.Second
+	time.Sleep(duration)
 	ScheduleScrape(chunk, c, a)
 }
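
Note for reviewers: below is a minimal, self-contained sketch of the
scheduling pattern this patch introduces, i.e. staggered goroutine starts
plus a per-chunk repeat interval. Everything in it (worker, interval, the
five-worker count) is hypothetical and only illustrates the idea; the real
code lives in ScheduleScrape, which recurses instead of looping.

package main

import (
	"fmt"
	"time"
)

// worker stands in for scraper.ScheduleScrape: produce one result,
// sleep for the configured interval, repeat. A for-loop is used here
// where the patch's ScheduleScrape calls itself recursively.
func worker(id int, interval time.Duration, results chan<- string) {
	for {
		results <- fmt.Sprintf("worker %d scraped at %s", id, time.Now().Format(time.RFC3339))
		time.Sleep(interval)
	}
}

func main() {
	results := make(chan string, 200)

	go func() {
		for id := 0; id < 5; id++ {
			go worker(id, 3*time.Second, results)
			// Stagger start times so upstream sees one request every
			// 100ms rather than a burst of requests all at once.
			time.Sleep(100 * time.Millisecond)
		}
	}()

	// Consume results forever, mirroring the range over the results
	// channel in main/main.go.
	for r := range results {
		fmt.Println(r)
	}
}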
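
Usage note, going by the TIMEOUT_PERIOD parsing above: the repeat interval
is read from the process environment at startup, so an operator could export
TIMEOUT_PERIOD=60 to re-scrape each chunk every 60 seconds. An unset variable
simply keeps the 180-second default, while a non-integer value logs a warning
and falls back to the same default.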