feat: fan out scrapes over time, support different scrape intervals
This should help avoid hitting upstream rate limits, and it lets admins control the update latency themselves.
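Concretely, the per-chunk re-scrape interval now comes from a TIMEOUT_PERIOD environment variable, read in seconds with 180 as the fallback (see the parseEnvars hunk below). A minimal sketch of that lookup, using os.LookupEnv directly rather than the repo's parseEnvars loop; readTimeoutPeriod is an illustrative name, not part of the repo:

package main

import (
	"fmt"
	"os"
	"strconv"
	"time"
)

// readTimeoutPeriod mirrors the TIMEOUT_PERIOD handling added to
// parseEnvars in this commit: value in seconds, defaulting to 180
// when the variable is missing or invalid.
func readTimeoutPeriod() int {
	const defaultTimeout = 180
	raw, ok := os.LookupEnv("TIMEOUT_PERIOD")
	if !ok {
		return defaultTimeout
	}
	v, err := strconv.Atoi(raw)
	if err != nil {
		fmt.Printf("WARN: Invalid value for TIMEOUT_PERIOD. Falling back to default value (%v)\n", defaultTimeout)
		return defaultTimeout
	}
	return v
}

func main() {
	timeoutPeriod := readTimeoutPeriod()
	// The scraper converts the value the same way further down:
	// time.Duration(timeoutPeriod) * time.Second
	fmt.Println("re-scrape interval:", time.Duration(timeoutPeriod)*time.Second)
}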
main/main.go | 28 changed lines

@@ -9,6 +9,7 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"time"
 
 	"git.dvdrw.dev/nsmarter/scraper/scraper"
 )
@@ -24,6 +25,7 @@ var chunkSize = 5
 var apiEndpoint string = "https://online.nsmart.rs/publicapi/v1/announcement/announcement.php"
 var apiKey string
 var limitQuerySize = 0
+var timeoutPeriod = 180
 var fillerUrl string = "localhost"
 
 func parseEnvars() {
@@ -41,6 +43,13 @@ func parseEnvars() {
 				Log.Printf("WARN: Invalid value for CHUNK_SIZE. Falling back to default value (%v)\n", defaultChunkSize)
 				chunkSize = defaultChunkSize
 			}
+		case "TIMEOUT_PERIOD":
+			var err error
+			timeoutPeriod, err = strconv.Atoi(pair[1])
+			if err != nil {
+				Log.Printf("WARN: Invalid value for TIMEOUT_PERIOD. Falling back to default value (%v)\n", 180)
+				timeoutPeriod = 180
+			}
 		case "LIMIT_QUERY_SIZE":
 			var err error
 			limitQuerySize, err = strconv.Atoi(pair[1])
@@ -86,16 +95,19 @@ func main() {
 	}
 
 	results := make(chan []scraper.ScrapeResult, 200)
-	for _, chunk := range stationChunks {
-		go scraper.ScheduleScrape(chunk,
-			results,
-			scraper.ApiConfig{Endpoint: apiEndpoint,
-				Key: apiKey})
-	}
+	go func() {
+		for _, chunk := range stationChunks {
+			go scraper.ScheduleScrape(chunk,
+				results,
+				scraper.ApiConfig{Endpoint: apiEndpoint,
+					Key:     apiKey,
+					Timeout: int64(timeoutPeriod)})
+			// Fan out scrapes over time so as to avoid upstream rate limits
+			time.Sleep(time.Millisecond * 100)
+		}
+	}()
 
 	for r := range results {
 		fmt.Printf("Received data: %#v\n", r)
 
 		json, err := json.Marshal(r)
 		if err != nil {
 			fmt.Print("Couldn't serialise struct to JSON: ", err)
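Distilled from the main() hunk above, a self-contained sketch of the fan-out pattern: launches are staggered from a wrapper goroutine so the consumer can start draining the shared results channel right away. scrapeChunk and the plain string results are illustrative stand-ins for the repo's scraper.ScheduleScrape and []scraper.ScrapeResult:

package main

import (
	"fmt"
	"time"
)

// scrapeChunk stands in for scraper.ScheduleScrape: it "scrapes" one chunk
// of stations and delivers the result on the shared channel.
func scrapeChunk(chunk []string, results chan<- string) {
	results <- fmt.Sprintf("scraped %d stations: %v", len(chunk), chunk)
}

func main() {
	stationChunks := [][]string{{"A1", "A2"}, {"B1", "B2"}, {"C1", "C2"}}
	results := make(chan string, 200)

	// Launch the scrapers from a wrapper goroutine so the consumer below
	// can run while launches are still being staggered.
	go func() {
		for _, chunk := range stationChunks {
			go scrapeChunk(chunk, results)
			// Fan out scrapes over time so as to avoid upstream rate limits.
			time.Sleep(time.Millisecond * 100)
		}
	}()

	// The real scraper ranges over the channel forever; this sketch just
	// drains one result per chunk and exits.
	for i := 0; i < len(stationChunks); i++ {
		fmt.Println("Received data:", <-results)
	}
}

The remaining hunks below are in the scraper package itself (ApiConfig and ScheduleScrape).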
@@ -60,6 +60,7 @@ func scrapeRange(s []Station) string {
 
 type ApiConfig struct {
 	Endpoint, Key string
+	Timeout       int64
 }
 
 func grabData(stations []Station, c ApiConfig) (map[string][]map[string]interface{}, error) {
@@ -296,6 +297,7 @@ func ScheduleScrape(chunk []Station, c chan []ScrapeResult, a ApiConfig) {
 	}
 
 	c <- r
-	time.Sleep(time.Minute * 3)
+	duration := time.Duration(a.Timeout) * time.Second
+	time.Sleep(duration)
 	ScheduleScrape(chunk, c, a)
 }
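And the other half of the change in isolation: each chunk's scrape now re-arms itself after ApiConfig.Timeout seconds instead of a hard-coded three minutes. The sketch below shows the same behaviour as a loop; the repo's ScheduleScrape re-invokes itself instead, and the scrape body, endpoint, and key here are placeholders:

package main

import (
	"fmt"
	"time"
)

// ApiConfig mirrors the struct extended in this commit; Timeout carries the
// re-scrape interval in seconds.
type ApiConfig struct {
	Endpoint, Key string
	Timeout       int64
}

// scheduleScrape is a loop-based illustration of the repo's ScheduleScrape:
// scrape, publish the result, sleep Timeout seconds, repeat.
func scheduleScrape(chunk []string, c chan<- string, a ApiConfig) {
	for {
		r := fmt.Sprintf("results for %v from %s", chunk, a.Endpoint)
		c <- r
		time.Sleep(time.Duration(a.Timeout) * time.Second)
	}
}

func main() {
	results := make(chan string, 1)
	cfg := ApiConfig{Endpoint: "https://example.invalid/api", Key: "placeholder", Timeout: 2}

	go scheduleScrape([]string{"A1", "A2"}, results, cfg)

	// Print two rounds, Timeout seconds apart, then exit; the real scraper
	// keeps going indefinitely.
	for i := 0; i < 2; i++ {
		fmt.Println(<-results)
	}
}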