From 6bb1b50a2c3d4aa9f41222dbad8b90ad94e0feb0 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Thu, 23 Nov 2023 14:33:34 +0500 Subject: [PATCH 01/27] init commit --- CHANGELOG.md | 0 CODE_OF_CONDUCT.md | 0 go.mod | 3 ++ oxylabs/common.go | 35 ++++++++++++++ oxylabs/types.go | 83 ++++++++++++++++++++++++++++++++ serp/client.go | 46 ++++++++++++++++++ serp/defaults.go | 29 +++++++++++ serp/req.go | 46 ++++++++++++++++++ serp/response.go | 62 ++++++++++++++++++++++++ serp/valid.go | 80 +++++++++++++++++++++++++++++++ serp/yandex.go | 117 +++++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 501 insertions(+) create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 go.mod create mode 100644 oxylabs/common.go create mode 100644 oxylabs/types.go create mode 100644 serp/client.go create mode 100644 serp/defaults.go create mode 100644 serp/req.go create mode 100644 serp/response.go create mode 100644 serp/valid.go create mode 100644 serp/yandex.go diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..e69de29 diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..2827079 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/mslmio/oxylabs-sdk-go + +go 1.21.0 diff --git a/oxylabs/common.go b/oxylabs/common.go new file mode 100644 index 0000000..10b7965 --- /dev/null +++ b/oxylabs/common.go @@ -0,0 +1,35 @@ +package oxylabs + +import ( + "fmt" + "net/url" + "strings" +) + +func ValidateURL( + inputURL string, + host string, +) error { + // Parse the URL + parsedURL, err := url.ParseRequestURI(inputURL) + if err != nil { + return fmt.Errorf("failed to parse URL: %v", err) + } + + // Check if the scheme (protocol) is present and non-empty. + if parsedURL.Scheme == "" { + return fmt.Errorf("url is missing scheme.") + } + + // Check if the Host is present and non-empty. 
+ if parsedURL.Host == "" { + return fmt.Errorf("url is missing a host.") + } + + // Check if the Host matches the expected domain/host. + if !strings.Contains(parsedURL.Host, host) { + return fmt.Errorf("url does not belong to %s", host) + } + + return nil +} diff --git a/oxylabs/types.go b/oxylabs/types.go new file mode 100644 index 0000000..dcd6b66 --- /dev/null +++ b/oxylabs/types.go @@ -0,0 +1,83 @@ +package oxylabs + +type UserAgent string + +var ( + UA_DESKTOP UserAgent = "desktop" + UA_DESKTOP_CHROME UserAgent = "desktop_chrome" + UA_DESKTOP_EDGE UserAgent = "desktop_edge" + UA_DESKTOP_FIREFOX UserAgent = "desktop_firefox" + UA_DESKTOP_OPERA UserAgent = "desktop_opera" + UA_DESKTOP_SAFARI UserAgent = "desktop_safari" + UA_MOBILE UserAgent = "mobile" + UA_MOBILE_ANDROID UserAgent = "mobile_android" + UA_MOBILE_IOS UserAgent = "mobile_ios" + UA_TABLET UserAgent = "tablet" + UA_TABLET_ANDROID UserAgent = "tablet_android" + UA_TABLET_IOS UserAgent = "tablet_ios" +) + +func IsUserAgentValid(ua string) bool { + switch UserAgent(ua) { + case + UA_DESKTOP, + UA_DESKTOP_CHROME, + UA_DESKTOP_EDGE, + UA_DESKTOP_FIREFOX, + UA_DESKTOP_OPERA, + UA_DESKTOP_SAFARI, + UA_MOBILE, + UA_MOBILE_ANDROID, + UA_MOBILE_IOS, + UA_TABLET, + UA_TABLET_ANDROID, + UA_TABLET_IOS: + return true + default: + return false + } +} + +type Render string + +var ( + HTML Render = "html" + PNG Render = "png" +) + +func IsRenderValid(render Render) bool { + switch render { + case + HTML, + PNG: + return true + default: + return false + } +} + +type Domain string + +var ( + DOMAIN_COM Domain = "com" + DOMAIN_RU Domain = "ru" + DOMAIN_UA Domain = "ua" + DOMAIN_BY Domain = "by" + DOMAIN_KZ Domain = "kz" + DOMAIN_TR Domain = "tr" +) + +type Locale string + +var ( + LOCALE_EN Locale = "en" + LOCALE_RU Locale = "ru" + LOCALE_BY Locale = "by" + LOCALE_DE Locale = "de" + LOCALE_FR Locale = "fr" + LOCALE_ID Locale = "id" + LOCALE_KK Locale = "kk" + LOCALE_TT Locale = "tt" + LOCALE_TR Locale = "tr" + 
LOCALE_UK Locale = "uk" +) diff --git a/serp/client.go b/serp/client.go new file mode 100644 index 0000000..68bc750 --- /dev/null +++ b/serp/client.go @@ -0,0 +1,46 @@ +package serp + +import ( + "net/http" +) + +type ApiCredentials struct { + Username string + Password string +} + +type SerpClient struct { + HttpClient *http.Client + ApiCredentials *ApiCredentials + BaseUrl string +} + +// Init for Sync runtime model. +func Init( + username string, + password string, +) *SerpClient { + return &SerpClient{ + ApiCredentials: &ApiCredentials{ + Username: username, + Password: password, + }, + HttpClient: &http.Client{}, + BaseUrl: "https://realtime.oxylabs.io/v1/queries", + } +} + +// Init for Async runtime model. +func InitAsync( + username string, + password string, +) *SerpClient { + return &SerpClient{ + ApiCredentials: &ApiCredentials{ + Username: username, + Password: password, + }, + HttpClient: &http.Client{}, + BaseUrl: "https://data.oxylabs.io/v1/queries", + } +} diff --git a/serp/defaults.go b/serp/defaults.go new file mode 100644 index 0000000..501f1e5 --- /dev/null +++ b/serp/defaults.go @@ -0,0 +1,29 @@ +package serp + +import "github.com/mslmio/oxylabs-sdk-go/oxylabs" + +// Default values for Yandex search source. +func (opt *YandexSearchOpts) setDefaults() { + if opt.Domain == "" { + opt.Domain = oxylabs.DOMAIN_COM + } + if opt.StartPage == 0 { + opt.StartPage = 1 + } + if opt.Pages == 0 { + opt.Pages = 1 + } + if opt.Limit == 0 { + opt.Limit = 10 + } + if opt.UserAgent == "" { + opt.UserAgent = oxylabs.UA_DESKTOP + } +} + +// Default values for Yandex url source. +func (opt *YandexUrlOpts) setDefaults() { + if opt.UserAgent == "" { + opt.UserAgent = oxylabs.UA_DESKTOP + } +} diff --git a/serp/req.go b/serp/req.go new file mode 100644 index 0000000..5a9c5a6 --- /dev/null +++ b/serp/req.go @@ -0,0 +1,46 @@ +package serp + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" +) + +// Request to the API. 
+func (c *SerpClient) Req( + jsonPayload []byte, +) (*Response, error) { + // Prepare requst. + request, _ := http.NewRequest("POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Set("Content-Type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + + // Get response. + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, err + } + + // Unmarshal the JSON object. + resp := &Response{} + if err := json.Unmarshal(responseBody, resp); err != nil { + return nil, fmt.Errorf("failed to parse JSON object: %v", err) + } + + // Set status. + resp.StatusCode = response.StatusCode + resp.Status = response.Status + + return resp, nil +} diff --git a/serp/response.go b/serp/response.go new file mode 100644 index 0000000..4661027 --- /dev/null +++ b/serp/response.go @@ -0,0 +1,62 @@ +package serp + +type Response struct { + Results []Result `json:"results"` + Job Job `json:"job"` + StatusCode int `json:"status_code"` + Status string `json:"status"` +} + +type Result struct { + Content string `json:"content"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` + Page int `json:"page"` + URL string `json:"url"` + JobID string `json:"job_id"` + StatusCode int `json:"status_code"` +} + +type Job struct { + CallbackURL string `json:"callback_url"` + ClientID int `json:"client_id"` + Context []Context `json:"context"` + CreatedAt string `json:"created_at"` + Domain string `json:"domain"` + GeoLocation string `json:"geo_location"` + ID string `json:"id"` + Limit int `json:"limit"` + Locale string `json:"locale"` + Pages int `json:"pages"` + Parse bool `json:"parse"` + ParserType string `json:"parser_type"` + ParsingInstructions string `json:"parsing_instructions"` + BrowserInstructions string `json:"browser_instructions"` + Render string 
`json:"render"` + URL string `json:"url"` + Query string `json:"query"` + Source string `json:"source"` + StartPage int `json:"start_page"` + Status string `json:"status"` + StorageType string `json:"storage_type"` + StorageURL string `json:"storage_url"` + Subdomain string `json:"subdomain"` + ContentEncoding string `json:"content_encoding"` + UpdatedAt string `json:"updated_at"` + UserAgentType string `json:"user_agent_type"` + SessionInfo string `json:"session_info"` + Statuses []string `json:"statuses"` + ClientNotes string `json:"client_notes"` + Links []Link `json:"_links"` +} + +type Context struct { + Key string `json:"key"` + Value string `json:"value"` +} + +type Link struct { + Rel string `json:"rel"` + Href string `json:"href"` + Method string `json:"method"` +} diff --git a/serp/valid.go b/serp/valid.go new file mode 100644 index 0000000..22947a5 --- /dev/null +++ b/serp/valid.go @@ -0,0 +1,80 @@ +package serp + +import ( + "fmt" + "reflect" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// Accepted parameters for yandex. +var yandexSearchAcceptedDomainParameters = []oxylabs.Domain{ + oxylabs.DOMAIN_COM, + oxylabs.DOMAIN_RU, + oxylabs.DOMAIN_UA, + oxylabs.DOMAIN_BY, + oxylabs.DOMAIN_KZ, + oxylabs.DOMAIN_TR, +} +var yandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ + oxylabs.LOCALE_EN, + oxylabs.LOCALE_RU, + oxylabs.LOCALE_BY, + oxylabs.LOCALE_DE, + oxylabs.LOCALE_FR, + oxylabs.LOCALE_ID, + oxylabs.LOCALE_KK, + oxylabs.LOCALE_TT, + oxylabs.LOCALE_TR, + oxylabs.LOCALE_UK, +} + +// Function to check validity of yandex search parameters. 
+func (opt *YandexSearchOpts) checkParameterValidity() error { + + if opt.Domain != "" && !inSlice(opt.Domain, yandexSearchAcceptedDomainParameters) { + return fmt.Errorf("invalid domain parameter: %s", opt.Domain) + } + + if opt.Locale != "" && !inSlice(opt.Locale, yandexSearchAcceptedLocaleParameters) { + return fmt.Errorf("invalid locale parameter: %s", opt.Locale) + } + + if !oxylabs.IsUserAgentValid(string(opt.UserAgent)) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + return nil +} + +// Function to check validity of yandex url parameters. +func (opt *YandexUrlOpts) checkParameterValidity() error { + + if !oxylabs.IsUserAgentValid(string(opt.UserAgent)) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + +// Functions to check other sources wll be here. + +// inSlice checks if a value is in the slice. +func inSlice(val interface{}, list interface{}) bool { + switch reflect.TypeOf(list).Kind() { + case reflect.Slice: + s := reflect.ValueOf(list) + + for i := 0; i < s.Len(); i++ { + if s.Index(i).Interface() == val { + return true + } + } + } + + return false +} diff --git a/serp/yandex.go b/serp/yandex.go new file mode 100644 index 0000000..5be67c8 --- /dev/null +++ b/serp/yandex.go @@ -0,0 +1,117 @@ +package serp + +import ( + "encoding/json" + "fmt" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +type YandexSearchOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + Locale string + GeoLocation string + UserAgent oxylabs.UserAgent + CallbackUrl string +} + +// Scrapes Yandex via its search engine. 
+func (c *SerpClient) ScrapeYandexSearch( + query string, + opts ...*YandexSearchOpts, +) (*Response, error) { + // Prepare options + opt := &YandexSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + opt.setDefaults() + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "yandex_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + fmt.Printf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload) + if err != nil { + return nil, err + } else { + return res, nil + } +} + +type YandexUrlOpts struct { + UserAgent oxylabs.UserAgent + Render oxylabs.Render + CallbackUrl string +} + +// Scrapes Yandex via provided url. +func (c *SerpClient) ScrapeYandexUrl( + url string, + opts ...*YandexUrlOpts, +) (*Response, error) { + // Check validity of url. + err := oxylabs.ValidateURL(url, "yandex") + if err != nil { + return nil, err + } + + // Prepare options. + opt := &YandexUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + opt.setDefaults() + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return nil, fmt.Errorf("invalid render option: %v", opt.Render) + } + + if !oxylabs.IsUserAgentValid(string(opt.UserAgent)) { + return nil, fmt.Errorf("invalid user agent option: %v", opt.UserAgent) + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "yandex", + "url": url, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + fmt.Printf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload) + if err != nil { + return nil, err + } else { + return res, nil + } + +} From 62e861a7c6e25d6ae10fe5f6bad3975adedfc60a Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Fri, 24 Nov 2023 15:55:57 +0500 Subject: [PATCH 02/27] some improvements + added bing and baidu --- oxylabs/types.go | 5 +- serp/baidu.go | 109 ++++++++++++++++++++++++++++++++++++++++++++ serp/bing.go | 116 +++++++++++++++++++++++++++++++++++++++++++++++ serp/defaults.go | 60 ++++++++++++++---------- serp/valid.go | 72 +++++++++++++++++++++++++++-- serp/yandex.go | 18 ++++---- 6 files changed, 342 insertions(+), 38 deletions(-) create mode 100644 serp/baidu.go create mode 100644 serp/bing.go diff --git a/oxylabs/types.go b/oxylabs/types.go index dcd6b66..0c56a2e 100644 --- a/oxylabs/types.go +++ b/oxylabs/types.go @@ -17,8 +17,8 @@ var ( UA_TABLET_IOS UserAgent = "tablet_ios" ) -func IsUserAgentValid(ua string) bool { - switch UserAgent(ua) { +func IsUserAgentValid(ua UserAgent) bool { + switch ua { case UA_DESKTOP, UA_DESKTOP_CHROME, @@ -65,6 +65,7 @@ var ( DOMAIN_BY Domain = "by" DOMAIN_KZ Domain = "kz" DOMAIN_TR Domain = "tr" + DOMAIN_CN Domain = "cn" ) type Locale string diff --git a/serp/baidu.go b/serp/baidu.go new file mode 100644 index 0000000..c3c9226 --- /dev/null +++ b/serp/baidu.go @@ -0,0 +1,109 @@ +package serp + +import ( + "encoding/json" + "fmt" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +type BaiduSearchOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + UserAgent oxylabs.UserAgent + CallbackUrl string +} + +// Scrapes Baidu via its search engine. 
+func (c *SerpClient) ScrapeBaiduSearch( + query string, + opts ...*BaiduSearchOpts, +) (*Response, error) { + // Prepare options + opt := &BaiduSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + SetDefaults(opt) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "baidu_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload) + if err != nil { + return nil, err + } else { + return res, nil + } +} + +type BaiduUrlOpts struct { + UserAgent oxylabs.UserAgent + CallbackUrl string +} + +// Scrapes Baidu via its url. +func (c *SerpClient) ScrapeBaiduUrl( + url string, + opts ...*BaiduUrlOpts, +) (*Response, error) { + // Check validity of url. + err := oxylabs.ValidateURL(url, "baidu") + if err != nil { + return nil, err + } + + // Prepare options + opt := &BaiduUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + SetDefaults(opt) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "baidu", + "url": url, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload) + if err != nil { + return nil, err + } else { + return res, nil + } + +} diff --git a/serp/bing.go b/serp/bing.go new file mode 100644 index 0000000..c538552 --- /dev/null +++ b/serp/bing.go @@ -0,0 +1,116 @@ +package serp + +import ( + "encoding/json" + "fmt" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +type BingSearchOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + Locale string + GeoLocation string + UserAgent oxylabs.UserAgent + CallbackUrl string + Render oxylabs.Render +} + +// Scrapes Bing via its search engine. +func (c *SerpClient) ScrapeBingSearch( + query string, + opts ...*BingSearchOpts, +) (*Response, error) { + // Prepare options + opt := &BingSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + SetDefaults(opt) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "bing_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + "render": opt.Render, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload) + if err != nil { + return nil, err + } else { + return res, nil + } +} + +type BingUrlOpts struct { + UserAgent oxylabs.UserAgent + Render oxylabs.Render + CallbackUrl string +} + +// Scrapes Bing via provided url. 
+func (c *SerpClient) ScrapeBingUrl( + url string, + opts ...*BingUrlOpts, +) (*Response, error) { + // Check validity of url. + err := oxylabs.ValidateURL(url, "bing") + if err != nil { + return nil, err + } + + // Prepare options. + opt := &BingUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + SetDefaults(opt) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "bing", + "url": url, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload) + if err != nil { + return nil, err + } else { + return res, nil + } +} diff --git a/serp/defaults.go b/serp/defaults.go index 501f1e5..94afc6b 100644 --- a/serp/defaults.go +++ b/serp/defaults.go @@ -1,29 +1,43 @@ package serp -import "github.com/mslmio/oxylabs-sdk-go/oxylabs" +import ( + "reflect" -// Default values for Yandex search source. -func (opt *YandexSearchOpts) setDefaults() { - if opt.Domain == "" { - opt.Domain = oxylabs.DOMAIN_COM - } - if opt.StartPage == 0 { - opt.StartPage = 1 - } - if opt.Pages == 0 { - opt.Pages = 1 - } - if opt.Limit == 0 { - opt.Limit = 10 - } - if opt.UserAgent == "" { - opt.UserAgent = oxylabs.UA_DESKTOP - } -} + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// Function to set default values for serp scrapers. +func SetDefaults(opt interface{}) { + val := reflect.ValueOf(opt).Elem() + + // Loop through the fields of the struct. + for i := 0; i < val.NumField(); i++ { + field := val.Field(i) + fieldType := val.Type().Field(i) + + // Set domain. + if fieldType.Name == "Domain" && field.String() == "" { + field.SetString(string(oxylabs.DOMAIN_COM)) + } + + // Set start page. 
+ if fieldType.Name == "StartPage" && field.Int() == 0 { + field.SetInt(1) + } + + // Set pages. + if fieldType.Name == "Pages" && field.Int() == 0 { + field.SetInt(1) + } + + // Set limit. + if fieldType.Name == "Limit" && field.Int() == 0 { + field.SetInt(10) + } -// Default values for Yandex url source. -func (opt *YandexUrlOpts) setDefaults() { - if opt.UserAgent == "" { - opt.UserAgent = oxylabs.UA_DESKTOP + // Set user agent. + if fieldType.Name == "UserAgent" && field.String() == "" { + field.SetString(string(oxylabs.UA_DESKTOP)) + } } } diff --git a/serp/valid.go b/serp/valid.go index 22947a5..82600aa 100644 --- a/serp/valid.go +++ b/serp/valid.go @@ -40,7 +40,7 @@ func (opt *YandexSearchOpts) checkParameterValidity() error { return fmt.Errorf("invalid locale parameter: %s", opt.Locale) } - if !oxylabs.IsUserAgentValid(string(opt.UserAgent)) { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) } @@ -50,7 +50,7 @@ func (opt *YandexSearchOpts) checkParameterValidity() error { // Function to check validity of yandex url parameters. func (opt *YandexUrlOpts) checkParameterValidity() error { - if !oxylabs.IsUserAgentValid(string(opt.UserAgent)) { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) } @@ -61,7 +61,73 @@ func (opt *YandexUrlOpts) checkParameterValidity() error { return nil } -// Functions to check other sources wll be here. +// Accepted parameters for bing. +var BingSearchAcceptedDomainParameters = []oxylabs.Domain{ + oxylabs.DOMAIN_COM, + oxylabs.DOMAIN_RU, + oxylabs.DOMAIN_UA, + oxylabs.DOMAIN_BY, + oxylabs.DOMAIN_KZ, + oxylabs.DOMAIN_TR, +} + +// Function to check validity of bing search parameters. 
+func (opt *BingSearchOpts) checkParameterValidity() error { + + if opt.Domain != "" && !inSlice(opt.Domain, BingSearchAcceptedDomainParameters) { + return fmt.Errorf("invalid domain parameter: %s", opt.Domain) + } + + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + return nil +} + +// Function to check validity of bing url parameters. +func (opt *BingUrlOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + +// Accepted parameters for baidu. +var BaiduSearchAcceptedDomainParameters = []oxylabs.Domain{ + oxylabs.DOMAIN_COM, + oxylabs.DOMAIN_CN, +} + +// Function to check validity of baidu search parameters. +func (opt *BaiduSearchOpts) checkParameterValidity() error { + if opt.Domain != "" && !inSlice(opt.Domain, BaiduSearchAcceptedDomainParameters) { + return fmt.Errorf("invalid domain parameter: %s", opt.Domain) + } + + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + return nil +} + +// Function to check validity of baidu url parameters. +func (opt *BaiduUrlOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + return nil +} // inSlice checks if a value is in the slice. 
func inSlice(val interface{}, list interface{}) bool { diff --git a/serp/yandex.go b/serp/yandex.go index 5be67c8..32aa571 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -28,7 +28,7 @@ func (c *SerpClient) ScrapeYandexSearch( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - opt.setDefaults() + SetDefaults(opt) // Check validity of parameters. err := opt.checkParameterValidity() @@ -51,7 +51,7 @@ func (c *SerpClient) ScrapeYandexSearch( } jsonPayload, err := json.Marshal(payload) if err != nil { - fmt.Printf("error marshalling payload: %v", err) + return nil, fmt.Errorf("error marshalling payload: %v", err) } res, err := c.Req(jsonPayload) @@ -84,14 +84,12 @@ func (c *SerpClient) ScrapeYandexUrl( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - opt.setDefaults() + SetDefaults(opt) - if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { - return nil, fmt.Errorf("invalid render option: %v", opt.Render) - } - - if !oxylabs.IsUserAgentValid(string(opt.UserAgent)) { - return nil, fmt.Errorf("invalid user agent option: %v", opt.UserAgent) + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err } // Prepare payload. 
@@ -104,7 +102,7 @@ func (c *SerpClient) ScrapeYandexUrl( } jsonPayload, err := json.Marshal(payload) if err != nil { - fmt.Printf("error marshalling payload: %v", err) + return nil, fmt.Errorf("error marshalling payload: %v", err) } res, err := c.Req(jsonPayload) From 7ada257b728b0bba3754776c2acda01004354fc3 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 28 Nov 2023 12:36:19 +0500 Subject: [PATCH 03/27] Apply suggestions from code review --- serp/baidu.go | 39 ++++++++++++- serp/bing.go | 55 +++++++++++++++++- serp/defaults.go | 54 ++++++++---------- serp/req.go | 3 +- serp/valid.go | 145 ++--------------------------------------------- serp/yandex.go | 66 ++++++++++++++++++++- 6 files changed, 182 insertions(+), 180 deletions(-) diff --git a/serp/baidu.go b/serp/baidu.go index c3c9226..5da13ab 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -7,6 +7,34 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// Accepted parameters for baidu. +var BaiduSearchAcceptedDomainParameters = []oxylabs.Domain{ + oxylabs.DOMAIN_COM, + oxylabs.DOMAIN_CN, +} + +// checkParameterValidity checks validity of baidu search parameters. +func (opt *BaiduSearchOpts) checkParameterValidity() error { + if !inList(opt.Domain, BaiduSearchAcceptedDomainParameters) { + return fmt.Errorf("invalid domain parameter: %s", opt.Domain) + } + + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + return nil +} + +// checkParameterValidity checks validity of baidu url parameters. +func (opt *BaiduUrlOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + return nil +} + type BaiduSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -26,7 +54,12 @@ func (c *SerpClient) ScrapeBaiduSearch( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - SetDefaults(opt) + + // Set defaults. 
+ SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. err := opt.checkParameterValidity() @@ -79,7 +112,9 @@ func (c *SerpClient) ScrapeBaiduUrl( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - SetDefaults(opt) + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. err = opt.checkParameterValidity() diff --git a/serp/bing.go b/serp/bing.go index c538552..e660e6e 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -7,12 +7,51 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// Accepted parameters for bing. +var BingSearchAcceptedDomainParameters = []oxylabs.Domain{ + oxylabs.DOMAIN_COM, + oxylabs.DOMAIN_RU, + oxylabs.DOMAIN_UA, + oxylabs.DOMAIN_BY, + oxylabs.DOMAIN_KZ, + oxylabs.DOMAIN_TR, +} + +// checkParameterValidity checks validity of bing search parameters. +func (opt *BingSearchOpts) checkParameterValidity() error { + if opt.Domain != "" && !inList(opt.Domain, BingSearchAcceptedDomainParameters) { + return fmt.Errorf("invalid domain parameter: %s", opt.Domain) + } + + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + return nil +} + +// checkParameterValidity checks validity of bing url parameters. 
+func (opt *BingUrlOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + type BingSearchOpts struct { Domain oxylabs.Domain StartPage int Pages int Limit int - Locale string + Locale oxylabs.Locale GeoLocation string UserAgent oxylabs.UserAgent CallbackUrl string @@ -29,7 +68,13 @@ func (c *SerpClient) ScrapeBingSearch( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - SetDefaults(opt) + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. err := opt.checkParameterValidity() @@ -66,6 +111,7 @@ func (c *SerpClient) ScrapeBingSearch( type BingUrlOpts struct { UserAgent oxylabs.UserAgent + GeoLocation string Render oxylabs.Render CallbackUrl string } @@ -86,7 +132,9 @@ func (c *SerpClient) ScrapeBingUrl( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - SetDefaults(opt) + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. err = opt.checkParameterValidity() @@ -99,6 +147,7 @@ func (c *SerpClient) ScrapeBingUrl( "source": "bing", "url": url, "user_agent_type": opt.UserAgent, + "geo_location": opt.GeoLocation, "render": opt.Render, "callback_url": opt.CallbackUrl, } diff --git a/serp/defaults.go b/serp/defaults.go index 94afc6b..aa48021 100644 --- a/serp/defaults.go +++ b/serp/defaults.go @@ -1,43 +1,35 @@ package serp import ( - "reflect" - "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// Function to set default values for serp scrapers. -func SetDefaults(opt interface{}) { - val := reflect.ValueOf(opt).Elem() - - // Loop through the fields of the struct. 
- for i := 0; i < val.NumField(); i++ { - field := val.Field(i) - fieldType := val.Type().Field(i) - - // Set domain. - if fieldType.Name == "Domain" && field.String() == "" { - field.SetString(string(oxylabs.DOMAIN_COM)) - } +func SetDefaultDomain(domain *oxylabs.Domain) { + if *domain == "" { + *domain = oxylabs.DOMAIN_COM + } +} - // Set start page. - if fieldType.Name == "StartPage" && field.Int() == 0 { - field.SetInt(1) - } +func SetDefaultStartPage(startPage *int) { + if *startPage == 0 { + *startPage = 1 + } +} - // Set pages. - if fieldType.Name == "Pages" && field.Int() == 0 { - field.SetInt(1) - } +func SetDefaultPages(pages *int) { + if *pages == 0 { + *pages = 1 + } +} - // Set limit. - if fieldType.Name == "Limit" && field.Int() == 0 { - field.SetInt(10) - } +func SetDefaultLimit(limit *int) { + if *limit == 0 { + *limit = 10 + } +} - // Set user agent. - if fieldType.Name == "UserAgent" && field.String() == "" { - field.SetString(string(oxylabs.UA_DESKTOP)) - } +func SetDefaultUserAgent(userAgent *oxylabs.UserAgent) { + if *userAgent == "" { + *userAgent = oxylabs.UA_DESKTOP } } diff --git a/serp/req.go b/serp/req.go index 5a9c5a6..9280be6 100644 --- a/serp/req.go +++ b/serp/req.go @@ -13,7 +13,8 @@ func (c *SerpClient) Req( jsonPayload []byte, ) (*Response, error) { // Prepare requst. - request, _ := http.NewRequest("POST", + request, _ := http.NewRequest( + "POST", c.BaseUrl, bytes.NewBuffer(jsonPayload), ) diff --git a/serp/valid.go b/serp/valid.go index 82600aa..71bf1fd 100644 --- a/serp/valid.go +++ b/serp/valid.go @@ -1,146 +1,11 @@ package serp -import ( - "fmt" - "reflect" - - "github.com/mslmio/oxylabs-sdk-go/oxylabs" -) - -// Accepted parameters for yandex. 
-var yandexSearchAcceptedDomainParameters = []oxylabs.Domain{ - oxylabs.DOMAIN_COM, - oxylabs.DOMAIN_RU, - oxylabs.DOMAIN_UA, - oxylabs.DOMAIN_BY, - oxylabs.DOMAIN_KZ, - oxylabs.DOMAIN_TR, -} -var yandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ - oxylabs.LOCALE_EN, - oxylabs.LOCALE_RU, - oxylabs.LOCALE_BY, - oxylabs.LOCALE_DE, - oxylabs.LOCALE_FR, - oxylabs.LOCALE_ID, - oxylabs.LOCALE_KK, - oxylabs.LOCALE_TT, - oxylabs.LOCALE_TR, - oxylabs.LOCALE_UK, -} - -// Function to check validity of yandex search parameters. -func (opt *YandexSearchOpts) checkParameterValidity() error { - - if opt.Domain != "" && !inSlice(opt.Domain, yandexSearchAcceptedDomainParameters) { - return fmt.Errorf("invalid domain parameter: %s", opt.Domain) - } - - if opt.Locale != "" && !inSlice(opt.Locale, yandexSearchAcceptedLocaleParameters) { - return fmt.Errorf("invalid locale parameter: %s", opt.Locale) - } - - if !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - return nil -} - -// Function to check validity of yandex url parameters. -func (opt *YandexUrlOpts) checkParameterValidity() error { - - if !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { - return fmt.Errorf("invalid render parameter: %v", opt.Render) - } - - return nil -} - -// Accepted parameters for bing. -var BingSearchAcceptedDomainParameters = []oxylabs.Domain{ - oxylabs.DOMAIN_COM, - oxylabs.DOMAIN_RU, - oxylabs.DOMAIN_UA, - oxylabs.DOMAIN_BY, - oxylabs.DOMAIN_KZ, - oxylabs.DOMAIN_TR, -} - -// Function to check validity of bing search parameters. 
-func (opt *BingSearchOpts) checkParameterValidity() error { - - if opt.Domain != "" && !inSlice(opt.Domain, BingSearchAcceptedDomainParameters) { - return fmt.Errorf("invalid domain parameter: %s", opt.Domain) - } - - if !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - if !oxylabs.IsRenderValid(opt.Render) { - return fmt.Errorf("invalid render parameter: %v", opt.Render) - } - return nil -} - -// Function to check validity of bing url parameters. -func (opt *BingUrlOpts) checkParameterValidity() error { - if !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { - return fmt.Errorf("invalid render parameter: %v", opt.Render) - } - - return nil -} - -// Accepted parameters for baidu. -var BaiduSearchAcceptedDomainParameters = []oxylabs.Domain{ - oxylabs.DOMAIN_COM, - oxylabs.DOMAIN_CN, -} - -// Function to check validity of baidu search parameters. -func (opt *BaiduSearchOpts) checkParameterValidity() error { - if opt.Domain != "" && !inSlice(opt.Domain, BaiduSearchAcceptedDomainParameters) { - return fmt.Errorf("invalid domain parameter: %s", opt.Domain) - } - - if !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - return nil -} - -// Function to check validity of baidu url parameters. -func (opt *BaiduUrlOpts) checkParameterValidity() error { - if !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - return nil -} - -// inSlice checks if a value is in the slice. 
-func inSlice(val interface{}, list interface{}) bool { - switch reflect.TypeOf(list).Kind() { - case reflect.Slice: - s := reflect.ValueOf(list) - - for i := 0; i < s.Len(); i++ { - if s.Index(i).Interface() == val { - return true - } +// Checks if the parameter is in the list of accepted parameters. +func inList[T comparable](val T, list []T) bool { + for _, item := range list { + if item == val { + return true } } - return false } diff --git a/serp/yandex.go b/serp/yandex.go index 32aa571..6f36cb8 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -7,12 +7,64 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// Accepted parameters for yandex. +var yandexSearchAcceptedDomainParameters = []oxylabs.Domain{ + oxylabs.DOMAIN_COM, + oxylabs.DOMAIN_RU, + oxylabs.DOMAIN_UA, + oxylabs.DOMAIN_BY, + oxylabs.DOMAIN_KZ, + oxylabs.DOMAIN_TR, +} +var yandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ + oxylabs.LOCALE_EN, + oxylabs.LOCALE_RU, + oxylabs.LOCALE_BY, + oxylabs.LOCALE_DE, + oxylabs.LOCALE_FR, + oxylabs.LOCALE_ID, + oxylabs.LOCALE_KK, + oxylabs.LOCALE_TT, + oxylabs.LOCALE_TR, + oxylabs.LOCALE_UK, +} + +// checkParameterValidity checks validity of yandex search parameters. +func (opt *YandexSearchOpts) checkParameterValidity() error { + if !inList(opt.Domain, yandexSearchAcceptedDomainParameters) { + return fmt.Errorf("invalid domain parameter: %s", opt.Domain) + } + + if opt.Locale != "" && !inList(opt.Locale, yandexSearchAcceptedLocaleParameters) { + return fmt.Errorf("invalid locale parameter: %s", opt.Locale) + } + + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + return nil +} + +// checkParameterValidity checks validity of yandex url parameters. 
+func (opt *YandexUrlOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + type YandexSearchOpts struct { Domain oxylabs.Domain StartPage int Pages int Limit int - Locale string + Locale oxylabs.Locale GeoLocation string UserAgent oxylabs.UserAgent CallbackUrl string @@ -28,7 +80,13 @@ func (c *SerpClient) ScrapeYandexSearch( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - SetDefaults(opt) + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. err := opt.checkParameterValidity() @@ -84,7 +142,9 @@ func (c *SerpClient) ScrapeYandexUrl( if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] } - SetDefaults(opt) + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. 
err = opt.checkParameterValidity() From 37300b9d6ae18090c6bf53d1653d4ef9d8df466d Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Fri, 1 Dec 2023 16:29:12 +0500 Subject: [PATCH 04/27] added google_search + adjusted or multiple return types --- serp/baidu.go | 8 +- serp/bing.go | 8 +- serp/context.go | 55 +++++++++++++ serp/google.go | 195 +++++++++++++++++++++++++++++++++++++++++++++++ serp/req.go | 29 +++++-- serp/response.go | 167 ++++++++++++++++++++++++++++++++++++++-- serp/yandex.go | 8 +- 7 files changed, 447 insertions(+), 23 deletions(-) create mode 100644 serp/context.go create mode 100644 serp/google.go diff --git a/serp/baidu.go b/serp/baidu.go index 5da13ab..7dc0f72 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -48,7 +48,7 @@ type BaiduSearchOpts struct { func (c *SerpClient) ScrapeBaiduSearch( query string, opts ...*BaiduSearchOpts, -) (*Response, error) { +) (interface{}, error) { // Prepare options opt := &BaiduSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -83,7 +83,7 @@ func (c *SerpClient) ScrapeBaiduSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload) + res, err := c.Req(jsonPayload, false) if err != nil { return nil, err } else { @@ -100,7 +100,7 @@ type BaiduUrlOpts struct { func (c *SerpClient) ScrapeBaiduUrl( url string, opts ...*BaiduUrlOpts, -) (*Response, error) { +) (interface{}, error) { // Check validity of url. 
err := oxylabs.ValidateURL(url, "baidu") if err != nil { @@ -134,7 +134,7 @@ func (c *SerpClient) ScrapeBaiduUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload) + res, err := c.Req(jsonPayload, false) if err != nil { return nil, err } else { diff --git a/serp/bing.go b/serp/bing.go index e660e6e..7fdde1c 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -62,7 +62,7 @@ type BingSearchOpts struct { func (c *SerpClient) ScrapeBingSearch( query string, opts ...*BingSearchOpts, -) (*Response, error) { +) (interface{}, error) { // Prepare options opt := &BingSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -101,7 +101,7 @@ func (c *SerpClient) ScrapeBingSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload) + res, err := c.Req(jsonPayload, false) if err != nil { return nil, err } else { @@ -120,7 +120,7 @@ type BingUrlOpts struct { func (c *SerpClient) ScrapeBingUrl( url string, opts ...*BingUrlOpts, -) (*Response, error) { +) (interface{}, error) { // Check validity of url. 
err := oxylabs.ValidateURL(url, "bing") if err != nil { @@ -156,7 +156,7 @@ func (c *SerpClient) ScrapeBingUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload) + res, err := c.Req(jsonPayload, false) if err != nil { return nil, err } else { diff --git a/serp/context.go b/serp/context.go new file mode 100644 index 0000000..e0d8519 --- /dev/null +++ b/serp/context.go @@ -0,0 +1,55 @@ +package serp + +type ContextOption map[string]interface{} + +type PageLimit struct { + Page int `json:"page"` + Limit int `json:"limit"` +} + +func LimitPerPage(limits []PageLimit) func(ContextOption) { + return func(ctx ContextOption) { + ctx["limit_per_page"] = limits + } +} +func ResultsLanguage(lang string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["results_language"] = lang + } +} + +func Filter(filter int) func(ContextOption) { + return func(ctx ContextOption) { + ctx["filter"] = filter + } +} + +func Nfpr(nfpr bool) func(ContextOption) { + return func(ctx ContextOption) { + ctx["nfpr"] = nfpr + } +} + +func SafeSearch(safeSearch bool) func(ContextOption) { + return func(ctx ContextOption) { + ctx["safe_search"] = safeSearch + } +} + +func Fpstate(fpstate string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["fpstate"] = fpstate + } +} + +func Tbm(tbm string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["tbm"] = tbm + } +} + +func Tbs(tbs string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["tbs"] = tbs + } +} diff --git a/serp/google.go b/serp/google.go new file mode 100644 index 0000000..c31063a --- /dev/null +++ b/serp/google.go @@ -0,0 +1,195 @@ +package serp + +import ( + "encoding/json" + "fmt" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +var AcceptedTbmParameters = []string{ + "app", + "bks", + "blg", + "dsc", + "isch", + "nws", + "pts", + "plcs", + "rcp", + "lcl", +} + +// checkParameterValidity checks validity of google search parameters. 
+func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + if opt.Limit <= 0 || opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + + if ctx["tbm"] != nil && !inList(ctx["tbm"].(string), AcceptedTbmParameters) { + return fmt.Errorf("invalid tbm parameter: %v", ctx["tbm"]) + } + + return nil +} + +type GoogleSearchOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + Locale oxylabs.Locale + Geolocation string + UserAgent oxylabs.UserAgent + Render oxylabs.Render + Parse bool + Context []func(ContextOption) +} + +func (c *SerpClient) ScrapeGoogleSearch( + query string, + opts ...*GoogleSearchOpts, +) (interface{}, error) { + // Prepare options. + opt := &GoogleSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Check if limit_per_page context parameter is used together with limit, start_page or pages parameters. + if (opt.Limit != 0 || opt.StartPage != 0 || opt.Pages != 0) && context["limit_per_page"] != nil { + return nil, fmt.Errorf("limit, start_page and pages parameters cannot be used together with limit_per_page context parameter") + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+ var payload map[string]interface{} + + // If user sends limit_per_page context parameter, use it instead of limit, start_page and pages parameters. + if context["limit_per_page"] != nil { + payload = map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "geolocation": opt.Geolocation, + "user_agent_type": opt.UserAgent, + "parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "filter", + "value": context["filter"], + }, + { + "key": "limit_per_page", + "value": context["limit_per_page"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "safe_search", + "value": context["safe_search"], + }, + { + "key": "fpstate", + "value": context["fpstate"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } + } else { + payload = map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "geolocation": opt.Geolocation, + "user_agent_type": opt.UserAgent, + "parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "filter", + "value": context["filter"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "safe_search", + "value": context["safe_search"], + }, + { + "key": "fpstate", + "value": context["fpstate"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } + } + + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload, opt.Parse) + if err != nil { + return nil, err + } else { + return res, nil + } +} diff --git 
a/serp/req.go b/serp/req.go index 9280be6..f06dce7 100644 --- a/serp/req.go +++ b/serp/req.go @@ -11,7 +11,8 @@ import ( // Request to the API. func (c *SerpClient) Req( jsonPayload []byte, -) (*Response, error) { + parse bool, +) (interface{}, error) { // Prepare requst. request, _ := http.NewRequest( "POST", @@ -33,15 +34,33 @@ func (c *SerpClient) Req( return nil, err } + // Send back error message. + if response.StatusCode != 200 { + return nil, fmt.Errorf("error with status code: %s : %s", response.Status, responseBody) + } + // Unmarshal the JSON object. - resp := &Response{} + var resp interface{} + if parse { + resp = &ParseTrueResponse{} + } else { + resp = &ParseFalseResponse{} + } if err := json.Unmarshal(responseBody, resp); err != nil { return nil, fmt.Errorf("failed to parse JSON object: %v", err) } - // Set status. - resp.StatusCode = response.StatusCode - resp.Status = response.Status + // Use type assertion to check the type and set status fields. + switch r := resp.(type) { + case *ParseTrueResponse: + r.StatusCode = response.StatusCode + r.Status = response.Status + case *ParseFalseResponse: + r.StatusCode = response.StatusCode + r.Status = response.Status + default: + return nil, fmt.Errorf("unexpected response type") + } return resp, nil } diff --git a/serp/response.go b/serp/response.go index 4661027..e34b66d 100644 --- a/serp/response.go +++ b/serp/response.go @@ -1,13 +1,168 @@ package serp -type Response struct { - Results []Result `json:"results"` - Job Job `json:"job"` - StatusCode int `json:"status_code"` - Status string `json:"status"` +type ParseTrueResponse struct { + Results []ResultParseTrue `json:"results"` + Job Job `json:"job"` + StatusCode int `json:"status_code"` + Status string `json:"status"` +} + +type ParseFalseResponse struct { + Results []ResultParseFalse `json:"results"` + Job Job `json:"job"` + StatusCode int `json:"status_code"` + Status string `json:"status"` +} + +type ResultParseTrue struct { + Content Content 
`json:"content"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` + Page int `json:"page"` + URL string `json:"url"` + JobID string `json:"job_id"` + StatusCode int `json:"status_code"` +} + +type Content struct { + Url string `json:"url"` + Page int `json:"page"` + Result Result `json:"results"` + LastVisiblePage int `json:"last_visible_page"` + ParseStatusCode int `json:"parse_status_code"` } type Result struct { + Pla Pla `json:"pla"` + Paid []Paid `json:"paid"` + Images Images `json:"images"` + Organic []Organic `json:"organic"` + Knowledge Knowledge `json:"knowledge"` + InstantAnswers []InstantAnswers `json:"instant_answers"` + RelatedSearches RelatedSearchesResults `json:"related_searches"` + SearchInformation SearchInformation `json:"search_information"` + TotalResultsCount int `json:"total_results_count"` + LastVisiblePage int `json:"last_visible_page"` + ParseStatusCode int `json:"parse_status_code"` +} +type Pla struct { + Items []PlaItem `json:"items"` + PosOverall *int64 `json:"pos_overall,omitempty"` +} + +type PlaItem struct { + Pos int `json:"pos"` + URL string `json:"url"` + Price string `json:"price"` + Title string `json:"title"` + Seller string `json:"seller"` + URLImage string `json:"url_image"` + ImageData string `json:"image_data"` +} + +type Paid struct { + Position int `json:"pos"` + Url string `json:"url"` + Description string `json:"desc"` + Title string `json:"title"` + DataRw string `json:"data_rw"` + DataPcu []string `json:"data_pcu"` + SiteLinks SiteLinks `json:"sitelinks"` +} + +type SiteLinks struct { + Expanded []Expanded `json:"expanded"` + UrlShown string `json:"url_shown"` + PosOverrall int `json:"pos_overall"` +} + +type Expanded struct { + Url string `json:"url"` + Description string `json:"desc"` + Title string `json:"title"` +} + +type Images struct { + Items []Item `json:"items"` + PosOverrall int `json:"pos_overall"` +} + +type Item struct { + Alt string `json:"alt"` + Pos int `json:"pos"` + Url 
string `json:"url"` + Source string `json:"source"` +} + +type Organic struct { + Position int `json:"pos"` + Url string `json:"url"` + Title string `json:"title"` + Description string `json:"desc"` + SiteLinks SiteLinksOrganic `json:"sitelinks"` + UrlShown string `json:"url_shown"` + PosOverrall int `json:"pos_overall"` +} + +type SiteLinksOrganic struct { + Expanded []ExpandedOrganic `json:"expanded"` +} + +type ExpandedOrganic struct { + Url string `json:"url"` + Title string `json:"title"` +} + +type Knowledge struct { + Title string `json:"title"` + Images []string `json:"images"` + Factoids []Factoids `json:"factoids"` + Profiles []Profiles `json:"profiles"` + Subtitle string `json:"subtitle"` + Description string `json:"description"` + RelatedSearches []RelatedSearches `json:"related_searches"` +} + +type Factoids struct { + Links []Links `json:"links"` + Title string `json:"title"` + Content string `json:"content"` +} + +type Links struct { + Href string `json:"href"` + Title string `json:"title"` +} + +type Profiles struct { + Url string `json:"url"` + Title string `json:"title"` +} + +type RelatedSearches struct { + Url string `json:"url"` + Title string `json:"title"` + SectionTitle string `json:"section_title"` +} + +type InstantAnswers struct { + Type string `json:"type"` + Parsed bool `json:"_parsed"` + PosOverrall int `json:"pos_overall"` +} + +type RelatedSearchesResults struct { + PosOverrall int `json:"pos_overall"` + RelatedSearches []string `json:"related_searches"` +} + +type SearchInformation struct { + Query string `json:"query"` + ShowingResultsFor string `json:"showing_results_for"` + TotalResultsCount int `json:"total_results_count"` +} + +type ResultParseFalse struct { Content string `json:"content"` CreatedAt string `json:"created_at"` UpdatedAt string `json:"updated_at"` @@ -52,7 +207,7 @@ type Job struct { type Context struct { Key string `json:"key"` - Value string `json:"value"` + Value any `json:"value"` } type Link struct { diff 
--git a/serp/yandex.go b/serp/yandex.go index 6f36cb8..4e6421e 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -74,7 +74,7 @@ type YandexSearchOpts struct { func (c *SerpClient) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, -) (*Response, error) { +) (interface{}, error) { // Prepare options opt := &YandexSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -112,7 +112,7 @@ func (c *SerpClient) ScrapeYandexSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload) + res, err := c.Req(jsonPayload, false) if err != nil { return nil, err } else { @@ -130,7 +130,7 @@ type YandexUrlOpts struct { func (c *SerpClient) ScrapeYandexUrl( url string, opts ...*YandexUrlOpts, -) (*Response, error) { +) (interface{}, error) { // Check validity of url. err := oxylabs.ValidateURL(url, "yandex") if err != nil { @@ -165,7 +165,7 @@ func (c *SerpClient) ScrapeYandexUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload) + res, err := c.Req(jsonPayload, false) if err != nil { return nil, err } else { From 6e6650817e7d1af4a80b33064fb1f87a8f76d0bf Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Fri, 1 Dec 2023 17:14:17 +0500 Subject: [PATCH 05/27] added google source and check for empty url --- oxylabs/common.go | 5 ++++ serp/google.go | 71 +++++++++++++++++++++++++++++++++++++++++++++++ serp/yandex.go | 1 - 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/oxylabs/common.go b/oxylabs/common.go index 10b7965..9abd9bf 100644 --- a/oxylabs/common.go +++ b/oxylabs/common.go @@ -10,6 +10,11 @@ func ValidateURL( inputURL string, host string, ) error { + // Check if url is empty. 
+ if inputURL == "" { + return fmt.Errorf("url parameter is empty") + } + // Parse the URL parsedURL, err := url.ParseRequestURI(inputURL) if err != nil { diff --git a/serp/google.go b/serp/google.go index c31063a..23e01b9 100644 --- a/serp/google.go +++ b/serp/google.go @@ -41,6 +41,19 @@ func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { return nil } +// checkParameterValidity checks validity of google search parameters. +func (opt *GoogleUrlOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + type GoogleSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -54,6 +67,7 @@ type GoogleSearchOpts struct { Context []func(ContextOption) } +// Scrapes Google via its search engine. func (c *SerpClient) ScrapeGoogleSearch( query string, opts ...*GoogleSearchOpts, @@ -193,3 +207,60 @@ func (c *SerpClient) ScrapeGoogleSearch( return res, nil } } + +type GoogleUrlOpts struct { + GeoLocation string + UserAgent oxylabs.UserAgent + Render oxylabs.Render + Parse bool + CallbackUrl string +} + +// Scrapes Google via provided url. +func (c *SerpClient) ScrapeGoogleUrl( + url string, + opts ...*GoogleUrlOpts, +) (interface{}, error) { + // Check validity of url. + err := oxylabs.ValidateURL(url, "google") + if err != nil { + return nil, err + } + + // Prepare options. + opt := &GoogleUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google", + "url": url, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + "geo_location": opt.GeoLocation, + "parse": opt.Parse, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload, opt.Parse) + if err != nil { + return nil, err + } else { + return res, nil + } +} diff --git a/serp/yandex.go b/serp/yandex.go index 4e6421e..b6a931c 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -171,5 +171,4 @@ func (c *SerpClient) ScrapeYandexUrl( } else { return res, nil } - } From fb3c8402a65a466e1168ece1be88e33727ae9417 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Thu, 7 Dec 2023 11:57:19 +0500 Subject: [PATCH 06/27] Apply suggestions from code review v2 --- oxylabs/common.go | 4 +- serp/valid.go => oxylabs/utils.go | 4 +- serp/baidu.go | 17 +- serp/bing.go | 18 +- serp/context.go | 1 + serp/google.go | 22 +- serp/req.go | 32 +-- serp/response.go | 382 ++++++++++++++++++------------ serp/yandex.go | 20 +- 9 files changed, 278 insertions(+), 222 deletions(-) rename serp/valid.go => oxylabs/utils.go (69%) diff --git a/oxylabs/common.go b/oxylabs/common.go index 9abd9bf..9735207 100644 --- a/oxylabs/common.go +++ b/oxylabs/common.go @@ -23,12 +23,12 @@ func ValidateURL( // Check if the scheme (protocol) is present and non-empty. if parsedURL.Scheme == "" { - return fmt.Errorf("url is missing scheme.") + return fmt.Errorf("url is missing scheme") } // Check if the Host is present and non-empty. if parsedURL.Host == "" { - return fmt.Errorf("url is missing a host.") + return fmt.Errorf("url is missing a host") } // Check if the Host matches the expected domain/host. 
diff --git a/serp/valid.go b/oxylabs/utils.go similarity index 69% rename from serp/valid.go rename to oxylabs/utils.go index 71bf1fd..08e5662 100644 --- a/serp/valid.go +++ b/oxylabs/utils.go @@ -1,7 +1,7 @@ -package serp +package oxylabs // Checks if the parameter is in the list of accepted parameters. -func inList[T comparable](val T, list []T) bool { +func InList[T comparable](val T, list []T) bool { for _, item := range list { if item == val { return true diff --git a/serp/baidu.go b/serp/baidu.go index 7dc0f72..a22cf38 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -15,7 +15,7 @@ var BaiduSearchAcceptedDomainParameters = []oxylabs.Domain{ // checkParameterValidity checks validity of baidu search parameters. func (opt *BaiduSearchOpts) checkParameterValidity() error { - if !inList(opt.Domain, BaiduSearchAcceptedDomainParameters) { + if !oxylabs.InList(opt.Domain, BaiduSearchAcceptedDomainParameters) { return fmt.Errorf("invalid domain parameter: %s", opt.Domain) } @@ -48,7 +48,7 @@ type BaiduSearchOpts struct { func (c *SerpClient) ScrapeBaiduSearch( query string, opts ...*BaiduSearchOpts, -) (interface{}, error) { +) (*Response, error) { // Prepare options opt := &BaiduSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -83,12 +83,12 @@ func (c *SerpClient) ScrapeBaiduSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, false) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } type BaiduUrlOpts struct { @@ -100,7 +100,7 @@ type BaiduUrlOpts struct { func (c *SerpClient) ScrapeBaiduUrl( url string, opts ...*BaiduUrlOpts, -) (interface{}, error) { +) (*Response, error) { // Check validity of url. 
err := oxylabs.ValidateURL(url, "baidu") if err != nil { @@ -134,11 +134,10 @@ func (c *SerpClient) ScrapeBaiduUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, false) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err - } else { - return res, nil } + return res, nil } diff --git a/serp/bing.go b/serp/bing.go index 7fdde1c..f454f01 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -19,7 +19,7 @@ var BingSearchAcceptedDomainParameters = []oxylabs.Domain{ // checkParameterValidity checks validity of bing search parameters. func (opt *BingSearchOpts) checkParameterValidity() error { - if opt.Domain != "" && !inList(opt.Domain, BingSearchAcceptedDomainParameters) { + if opt.Domain != "" && !oxylabs.InList(opt.Domain, BingSearchAcceptedDomainParameters) { return fmt.Errorf("invalid domain parameter: %s", opt.Domain) } @@ -62,7 +62,7 @@ type BingSearchOpts struct { func (c *SerpClient) ScrapeBingSearch( query string, opts ...*BingSearchOpts, -) (interface{}, error) { +) (*Response, error) { // Prepare options opt := &BingSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -101,12 +101,12 @@ func (c *SerpClient) ScrapeBingSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, false) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } type BingUrlOpts struct { @@ -120,7 +120,7 @@ type BingUrlOpts struct { func (c *SerpClient) ScrapeBingUrl( url string, opts ...*BingUrlOpts, -) (interface{}, error) { +) (*Response, error) { // Check validity of url. 
err := oxylabs.ValidateURL(url, "bing") if err != nil { @@ -156,10 +156,10 @@ func (c *SerpClient) ScrapeBingUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, false) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } diff --git a/serp/context.go b/serp/context.go index e0d8519..40a0b40 100644 --- a/serp/context.go +++ b/serp/context.go @@ -12,6 +12,7 @@ func LimitPerPage(limits []PageLimit) func(ContextOption) { ctx["limit_per_page"] = limits } } + func ResultsLanguage(lang string) func(ContextOption) { return func(ctx ContextOption) { ctx["results_language"] = lang diff --git a/serp/google.go b/serp/google.go index 23e01b9..24d45a5 100644 --- a/serp/google.go +++ b/serp/google.go @@ -34,7 +34,7 @@ func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") } - if ctx["tbm"] != nil && !inList(ctx["tbm"].(string), AcceptedTbmParameters) { + if ctx["tbm"] != nil && !oxylabs.InList(ctx["tbm"].(string), AcceptedTbmParameters) { return fmt.Errorf("invalid tbm parameter: %v", ctx["tbm"]) } @@ -71,7 +71,7 @@ type GoogleSearchOpts struct { func (c *SerpClient) ScrapeGoogleSearch( query string, opts ...*GoogleSearchOpts, -) (interface{}, error) { +) (*Response, error) { // Prepare options. 
opt := &GoogleSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -111,7 +111,7 @@ func (c *SerpClient) ScrapeGoogleSearch( "source": "google_search", "domain": opt.Domain, "query": query, - "geolocation": opt.Geolocation, + "geo_location": opt.Geolocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -158,7 +158,7 @@ func (c *SerpClient) ScrapeGoogleSearch( "start_page": opt.StartPage, "pages": opt.Pages, "limit": opt.Limit, - "geolocation": opt.Geolocation, + "geo_location": opt.Geolocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -200,12 +200,12 @@ func (c *SerpClient) ScrapeGoogleSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, opt.Parse) + res, err := c.Req(jsonPayload, opt.Parse, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } type GoogleUrlOpts struct { @@ -220,7 +220,7 @@ type GoogleUrlOpts struct { func (c *SerpClient) ScrapeGoogleUrl( url string, opts ...*GoogleUrlOpts, -) (interface{}, error) { +) (*Response, error) { // Check validity of url. err := oxylabs.ValidateURL(url, "google") if err != nil { @@ -257,10 +257,10 @@ func (c *SerpClient) ScrapeGoogleUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, opt.Parse) + res, err := c.Req(jsonPayload, opt.Parse, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } diff --git a/serp/req.go b/serp/req.go index f06dce7..26e9d7c 100644 --- a/serp/req.go +++ b/serp/req.go @@ -2,7 +2,6 @@ package serp import ( "bytes" - "encoding/json" "fmt" "io" "net/http" @@ -12,10 +11,11 @@ import ( func (c *SerpClient) Req( jsonPayload []byte, parse bool, -) (interface{}, error) { + method string, +) (*Response, error) { // Prepare requst. 
request, _ := http.NewRequest( - "POST", + method, c.BaseUrl, bytes.NewBuffer(jsonPayload), ) @@ -36,31 +36,19 @@ func (c *SerpClient) Req( // Send back error message. if response.StatusCode != 200 { - return nil, fmt.Errorf("error with status code: %s : %s", response.Status, responseBody) + return nil, fmt.Errorf("error with status code %s: %s", response.Status, responseBody) } // Unmarshal the JSON object. - var resp interface{} - if parse { - resp = &ParseTrueResponse{} - } else { - resp = &ParseFalseResponse{} - } - if err := json.Unmarshal(responseBody, resp); err != nil { + resp := &Response{} + resp.Parse = parse + if err := resp.UnmarshalJSON(responseBody); err != nil { return nil, fmt.Errorf("failed to parse JSON object: %v", err) } - // Use type assertion to check the type and set status fields. - switch r := resp.(type) { - case *ParseTrueResponse: - r.StatusCode = response.StatusCode - r.Status = response.Status - case *ParseFalseResponse: - r.StatusCode = response.StatusCode - r.Status = response.Status - default: - return nil, fmt.Errorf("unexpected response type") - } + // Set status code and status. + resp.StatusCode = response.StatusCode + resp.Status = response.Status return resp, nil } diff --git a/serp/response.go b/serp/response.go index e34b66d..fe8fb18 100644 --- a/serp/response.go +++ b/serp/response.go @@ -1,53 +1,175 @@ package serp -type ParseTrueResponse struct { - Results []ResultParseTrue `json:"results"` - Job Job `json:"job"` - StatusCode int `json:"status_code"` - Status string `json:"status"` +import ( + "encoding/json" +) + +// Custom function to unmarshall into the Response struct because of +// different return types depending on the parse option. +func (r *Response) UnmarshalJSON(data []byte) error { + // Unmarshal json data into RawResponse map. + var rawResponse map[string]json.RawMessage + if err := json.Unmarshal(data, &rawResponse); err != nil { + return err + } + + // Unmarshal the results array. 
+ if resultsData, ok := rawResponse["results"]; ok { + // Slice to store raw JSON messages for each result. + var resultsRawMessages []json.RawMessage + if err := json.Unmarshal(resultsData, &resultsRawMessages); err != nil { + return err + } + + // Unmarshal each result into the Results slice. + for _, resultRawMessage := range resultsRawMessages { + if r.Parse { + var result struct { + ContentParsed Content `json:"content"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` + Page int `json:"page"` + URL string `json:"url"` + JobID string `json:"job_id"` + StatusCode int `json:"status_code"` + } + if err := json.Unmarshal(resultRawMessage, &result); err != nil { + return err + } + r.Results = append(r.Results, Result{ + ContentParsed: result.ContentParsed, + CreatedAt: result.CreatedAt, + UpdatedAt: result.UpdatedAt, + Page: result.Page, + URL: result.URL, + JobID: result.JobID, + StatusCode: result.StatusCode, + }) + } else { + var result struct { + Content string `json:"content"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` + Page int `json:"page"` + URL string `json:"url"` + JobID string `json:"job_id"` + StatusCode int `json:"status_code"` + } + if err := json.Unmarshal(resultRawMessage, &result); err != nil { + return err + } + r.Results = append(r.Results, Result{ + Content: result.Content, + CreatedAt: result.CreatedAt, + UpdatedAt: result.UpdatedAt, + Page: result.Page, + URL: result.URL, + JobID: result.JobID, + StatusCode: result.StatusCode, + }) + } + } + } + + // Unmarshal the job object. 
+ if jobData, ok := rawResponse["job"]; ok { + var job Job + if err := json.Unmarshal(jobData, &job); err != nil { + return err + } + r.Job = job + } + + return nil } -type ParseFalseResponse struct { - Results []ResultParseFalse `json:"results"` - Job Job `json:"job"` - StatusCode int `json:"status_code"` - Status string `json:"status"` +type Response struct { + Parse bool `json:"parse"` + Results []Result `json:"results"` + Job Job `json:"job"` + StatusCode int `json:"status_code"` + Status string `json:"status"` } -type ResultParseTrue struct { - Content Content `json:"content"` - CreatedAt string `json:"created_at"` - UpdatedAt string `json:"updated_at"` - Page int `json:"page"` - URL string `json:"url"` - JobID string `json:"job_id"` - StatusCode int `json:"status_code"` +type Job struct { + CallbackURL string `json:"callback_url"` + ClientID int `json:"client_id"` + Context []Context `json:"context"` + CreatedAt string `json:"created_at"` + Domain string `json:"domain"` + GeoLocation interface{} `json:"geo_location"` + ID string `json:"id"` + Limit int `json:"limit"` + Locale interface{} `json:"locale"` + Pages int `json:"pages"` + Parse bool `json:"parse"` + ParserType interface{} `json:"parser_type"` + ParsingInstructions interface{} `json:"parsing_instructions"` + BrowserInstructions interface{} `json:"browser_instructions"` + Render interface{} `json:"render"` + URL interface{} `json:"url"` + Query string `json:"query"` + Source string `json:"source"` + StartPage int `json:"start_page"` + Status string `json:"status"` + StorageType interface{} `json:"storage_type"` + StorageURL interface{} `json:"storage_url"` + Subdomain string `json:"subdomain"` + ContentEncoding string `json:"content_encoding"` + UpdatedAt string `json:"updated_at"` + UserAgentType string `json:"user_agent_type"` + SessionInfo interface{} `json:"session_info"` + Statuses []interface{} `json:"statuses"` + ClientNotes interface{} `json:"client_notes"` + Links []Link `json:"_links"` } 
-type Content struct { - Url string `json:"url"` - Page int `json:"page"` - Result Result `json:"results"` - LastVisiblePage int `json:"last_visible_page"` - ParseStatusCode int `json:"parse_status_code"` +type Context struct { + Key string `json:"key"` + Value interface{} `json:"value"` +} + +type Link struct { + Rel string `json:"rel"` + Href string `json:"href"` + Method string `json:"method"` } type Result struct { - Pla Pla `json:"pla"` - Paid []Paid `json:"paid"` - Images Images `json:"images"` - Organic []Organic `json:"organic"` - Knowledge Knowledge `json:"knowledge"` - InstantAnswers []InstantAnswers `json:"instant_answers"` - RelatedSearches RelatedSearchesResults `json:"related_searches"` - SearchInformation SearchInformation `json:"search_information"` - TotalResultsCount int `json:"total_results_count"` - LastVisiblePage int `json:"last_visible_page"` - ParseStatusCode int `json:"parse_status_code"` + ContentParsed Content + Content string + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` + Page int `json:"page"` + URL string `json:"url"` + JobID string `json:"job_id"` + StatusCode int `json:"status_code"` + ParserType string `json:"parser_type"` +} + +type Content struct { + URL string `json:"url"` + Page int `json:"page"` + Results Results `json:"results"` + LastVisiblePage int `json:"last_visible_page"` + ParseStatusCode int `json:"parse_status_code"` +} + +type Results struct { + Pla Pla `json:"pla"` + Paid []Paid `json:"paid"` + Organic []Organic `json:"organic"` + Knowledge Knowledge `json:"knowledge"` + LocalPack LocalPack `json:"local_pack"` + InstantAnswers []InstantAnswer `json:"instant_answers"` + RelatedSearches RelatedSearches `json:"related_searches"` + SearchInformation SearchInformation `json:"search_information"` + TotalResultsCount int `json:"total_results_count"` } + type Pla struct { Items []PlaItem `json:"items"` - PosOverall *int64 `json:"pos_overall,omitempty"` + PosOverall int 
`json:"pos_overall,omitempty"` } type PlaItem struct { @@ -60,99 +182,99 @@ type PlaItem struct { ImageData string `json:"image_data"` } -type Paid struct { - Position int `json:"pos"` - Url string `json:"url"` - Description string `json:"desc"` - Title string `json:"title"` - DataRw string `json:"data_rw"` - DataPcu []string `json:"data_pcu"` - SiteLinks SiteLinks `json:"sitelinks"` -} - -type SiteLinks struct { - Expanded []Expanded `json:"expanded"` - UrlShown string `json:"url_shown"` - PosOverrall int `json:"pos_overall"` +type InstantAnswer struct { + Type string `json:"type"` + Parsed bool `json:"_parsed"` + PosOverall int `json:"pos_overall"` } -type Expanded struct { - Url string `json:"url"` - Description string `json:"desc"` - Title string `json:"title"` +type Knowledge struct { + Title string `json:"title"` + Images []string `json:"images"` + Factoids []Factoid `json:"factoids"` + Profiles []Profile `json:"profiles"` + Subtitle string `json:"subtitle"` + Description string `json:"description"` + RelatedSearches []RelatedSearch `json:"related_searches"` } -type Images struct { - Items []Item `json:"items"` - PosOverrall int `json:"pos_overall"` +type Factoid struct { + Links []LinkElement `json:"links"` + Title string `json:"title"` + Content string `json:"content"` } -type Item struct { - Alt string `json:"alt"` - Pos int `json:"pos"` - Url string `json:"url"` - Source string `json:"source"` +type LinkElement struct { + Href string `json:"href"` + Title string `json:"title"` } -type Organic struct { - Position int `json:"pos"` - Url string `json:"url"` - Title string `json:"title"` - Description string `json:"desc"` - SiteLinks SiteLinksOrganic `json:"sitelinks"` - UrlShown string `json:"url_shown"` - PosOverrall int `json:"pos_overall"` +type Profile struct { + URL string `json:"url"` + Title string `json:"title"` } -type SiteLinksOrganic struct { - Expanded []ExpandedOrganic `json:"expanded"` +type RelatedSearch struct { + URL string `json:"url"` + 
Title string `json:"title"` + SectionTitle string `json:"section_title"` } -type ExpandedOrganic struct { - Url string `json:"url"` - Title string `json:"title"` +type LocalPack struct { + Items []Item `json:"items"` + PosOverall int `json:"pos_overall"` } -type Knowledge struct { - Title string `json:"title"` - Images []string `json:"images"` - Factoids []Factoids `json:"factoids"` - Profiles []Profiles `json:"profiles"` - Subtitle string `json:"subtitle"` - Description string `json:"description"` - RelatedSearches []RelatedSearches `json:"related_searches"` +type Item struct { + Cid string `json:"cid"` + Pos int `json:"pos"` + Links []LinkElement `json:"links"` + Title string `json:"title"` + Rating int `json:"rating"` + Address string `json:"address"` + RatingCount int `json:"rating_count"` } -type Factoids struct { - Links []Links `json:"links"` - Title string `json:"title"` - Content string `json:"content"` +type Organic struct { + Pos int `json:"pos"` + URL string `json:"url"` + Desc string `json:"desc"` + Title string `json:"title"` + Sitelinks OrganicSitelinks `json:"sitelinks,omitempty"` + URLShown string `json:"url_shown"` + PosOverall int `json:"pos_overall"` } -type Links struct { - Href string `json:"href"` - Title string `json:"title"` +type OrganicSitelinks struct { + Expanded []Profile `json:"expanded,omitempty"` + Inline []Profile `json:"inline,omitempty"` } -type Profiles struct { - Url string `json:"url"` - Title string `json:"title"` +type Paid struct { + Pos int `json:"pos"` + URL string `json:"url"` + Desc string `json:"desc"` + Title string `json:"title"` + DataRw string `json:"data_rw"` + DataPcu []string `json:"data_pcu"` + Sitelinks PaidSitelinks `json:"sitelinks,omitempty"` + URLShown string `json:"url_shown"` + PosOverall int `json:"pos_overall"` } -type RelatedSearches struct { - Url string `json:"url"` - Title string `json:"title"` - SectionTitle string `json:"section_title"` +type PaidSitelinks struct { + Expanded []Expanded 
`json:"expanded,omitempty"` + Inline []Profile `json:"inline,omitempty"` } -type InstantAnswers struct { - Type string `json:"type"` - Parsed bool `json:"_parsed"` - PosOverrall int `json:"pos_overall"` +type Expanded struct { + URL string `json:"url"` + Desc string `json:"desc"` + Title string `json:"title"` } -type RelatedSearchesResults struct { - PosOverrall int `json:"pos_overall"` +type RelatedSearches struct { + PosOverall int `json:"pos_overall"` RelatedSearches []string `json:"related_searches"` } @@ -161,57 +283,3 @@ type SearchInformation struct { ShowingResultsFor string `json:"showing_results_for"` TotalResultsCount int `json:"total_results_count"` } - -type ResultParseFalse struct { - Content string `json:"content"` - CreatedAt string `json:"created_at"` - UpdatedAt string `json:"updated_at"` - Page int `json:"page"` - URL string `json:"url"` - JobID string `json:"job_id"` - StatusCode int `json:"status_code"` -} - -type Job struct { - CallbackURL string `json:"callback_url"` - ClientID int `json:"client_id"` - Context []Context `json:"context"` - CreatedAt string `json:"created_at"` - Domain string `json:"domain"` - GeoLocation string `json:"geo_location"` - ID string `json:"id"` - Limit int `json:"limit"` - Locale string `json:"locale"` - Pages int `json:"pages"` - Parse bool `json:"parse"` - ParserType string `json:"parser_type"` - ParsingInstructions string `json:"parsing_instructions"` - BrowserInstructions string `json:"browser_instructions"` - Render string `json:"render"` - URL string `json:"url"` - Query string `json:"query"` - Source string `json:"source"` - StartPage int `json:"start_page"` - Status string `json:"status"` - StorageType string `json:"storage_type"` - StorageURL string `json:"storage_url"` - Subdomain string `json:"subdomain"` - ContentEncoding string `json:"content_encoding"` - UpdatedAt string `json:"updated_at"` - UserAgentType string `json:"user_agent_type"` - SessionInfo string `json:"session_info"` - Statuses []string 
`json:"statuses"` - ClientNotes string `json:"client_notes"` - Links []Link `json:"_links"` -} - -type Context struct { - Key string `json:"key"` - Value any `json:"value"` -} - -type Link struct { - Rel string `json:"rel"` - Href string `json:"href"` - Method string `json:"method"` -} diff --git a/serp/yandex.go b/serp/yandex.go index b6a931c..84f7147 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -31,11 +31,11 @@ var yandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ // checkParameterValidity checks validity of yandex search parameters. func (opt *YandexSearchOpts) checkParameterValidity() error { - if !inList(opt.Domain, yandexSearchAcceptedDomainParameters) { + if !oxylabs.InList(opt.Domain, yandexSearchAcceptedDomainParameters) { return fmt.Errorf("invalid domain parameter: %s", opt.Domain) } - if opt.Locale != "" && !inList(opt.Locale, yandexSearchAcceptedLocaleParameters) { + if opt.Locale != "" && !oxylabs.InList(opt.Locale, yandexSearchAcceptedLocaleParameters) { return fmt.Errorf("invalid locale parameter: %s", opt.Locale) } @@ -74,7 +74,7 @@ type YandexSearchOpts struct { func (c *SerpClient) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, -) (interface{}, error) { +) (*Response, error) { // Prepare options opt := &YandexSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -112,12 +112,12 @@ func (c *SerpClient) ScrapeYandexSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, false) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } type YandexUrlOpts struct { @@ -130,7 +130,7 @@ type YandexUrlOpts struct { func (c *SerpClient) ScrapeYandexUrl( url string, opts ...*YandexUrlOpts, -) (interface{}, error) { +) (*Response, error) { // Check validity of url. 
err := oxylabs.ValidateURL(url, "yandex") if err != nil { @@ -165,10 +165,10 @@ func (c *SerpClient) ScrapeYandexUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - res, err := c.Req(jsonPayload, false) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err - } else { - return res, nil } + + return res, nil } From e72bf48d8b4e07cddff06cc5806cdd5500b58022 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Thu, 7 Dec 2023 18:35:07 +0500 Subject: [PATCH 07/27] added remaining google serp sources --- serp/context.go | 42 ++++ serp/google.go | 545 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 584 insertions(+), 3 deletions(-) diff --git a/serp/context.go b/serp/context.go index 40a0b40..5df932e 100644 --- a/serp/context.go +++ b/serp/context.go @@ -54,3 +54,45 @@ func Tbs(tbs string) func(ContextOption) { ctx["tbs"] = tbs } } + +func HotelOccupancy(num int) func(ContextOption) { + return func(ctx ContextOption) { + ctx["hotel_occupancy"] = num + } +} + +func HotelDates(dates string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["hotel_dates"] = dates + } +} + +func HotelClasses(classes []int) func(ContextOption) { + return func(ctx ContextOption) { + ctx["hotel_classes"] = classes + } +} + +func SearchType(searchType string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["search_type"] = searchType + } +} + +func DateFrom(dateFrom string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["date_from"] = dateFrom + } +} + +func DateTo(dateTo string) func(ContextOption) { + return func(ctx ContextOption) { + ctx["date_to"] = dateTo + } +} + +func CategoryId(categoryId int) func(ContextOption) { + return func(ctx ContextOption) { + ctx["category_id"] = categoryId + } +} diff --git a/serp/google.go b/serp/google.go index 24d45a5..000b1b7 100644 --- a/serp/google.go +++ b/serp/google.go @@ -20,6 +20,13 @@ var AcceptedTbmParameters = []string{ "lcl", } +var 
AcceptedSearchTypeParameters = []string{ + "web_search", + "image_search", + "google_shopping", + "youtube_search", +} + // checkParameterValidity checks validity of google search parameters. func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { @@ -41,7 +48,7 @@ func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { return nil } -// checkParameterValidity checks validity of google search parameters. +// checkParameterValidity checks validity of google url parameters. func (opt *GoogleUrlOpts) checkParameterValidity() error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -54,6 +61,85 @@ func (opt *GoogleUrlOpts) checkParameterValidity() error { return nil } +// checkParameterValidity checks validity of google ads parameters. +func (opt *GoogleAdsOpts) checkParameterValidity(ctx ContextOption) error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + if opt.Limit <= 0 || opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + + if ctx["tbm"] != nil && !oxylabs.InList(ctx["tbm"].(string), AcceptedTbmParameters) { + return fmt.Errorf("invalid tbm parameter: %v", ctx["tbm"]) + } + + return nil +} + +// checkParameterValidity checks validity of google suggestions parameters. 
+func (opt *GoogleSuggestionsOpts) checkParameterValidity() error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + +// checkParameterValidity checks validity of google hotels parameters. +func (opt *GoogleHotelsOpts) checkParameterValidity(ctx ContextOption) error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + if opt.Limit <= 0 || opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + + return nil +} + +// checkParameterValidity checks validity of google travel hotels parameters. +func (opt *GoogleTravelHotelsOpts) checkParameterValidity(ctx ContextOption) error { + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + if opt.Limit <= 0 || opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + + return nil +} + +// checkParameterValidity checks validity of google trends explore parameters. 
+func (opt *GoogleTrendsExploreOpts) checkParameterValidity(ctx ContextOption) error { + + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if ctx["search_type"] != nil && !oxylabs.InList(ctx["search_type"].(string), AcceptedSearchTypeParameters) { + return fmt.Errorf("invalid search_type parameter: %v", ctx["search_type"]) + } + + return nil +} + type GoogleSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -78,7 +164,7 @@ func (c *SerpClient) ScrapeGoogleSearch( opt = opts[len(opts)-1] } - // Initialize the context map apply each provided context modifier function. + // Initialize the context map and apply each provided context modifier function. context := make(ContextOption) for _, modifier := range opt.Context { modifier(context) @@ -216,7 +302,7 @@ type GoogleUrlOpts struct { CallbackUrl string } -// Scrapes Google via provided url. +// ScrapeGoogleUrl scrapes google vith google as source. func (c *SerpClient) ScrapeGoogleUrl( url string, opts ...*GoogleUrlOpts, @@ -264,3 +350,456 @@ func (c *SerpClient) ScrapeGoogleUrl( return res, nil } + +// checkParameterValidity checks validity of google images parameters. +func (opt *GoogleImagesOpts) checkParameterValidity(ctx ContextOption) error { + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + if opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + + return nil +} + +type GoogleAdsOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + Locale string + GeoLocation string + UserAgent oxylabs.UserAgent + Render oxylabs.Render + Parse bool + Context []func(ContextOption) +} + +// SrcapeGoogleAds scrapes google via the google_ads source. 
+func (c *SerpClient) ScrapeGoogleAds(
+	query string,
+	opts ...*GoogleAdsOpts,
+) (*Response, error) {
+	// Prepare options.
+	opt := &GoogleAdsOpts{}
+	if len(opts) > 0 && opts[len(opts)-1] != nil {
+		opt = opts[len(opts)-1]
+	}
+
+	// Initialize the context map and apply each provided context modifier function.
+	context := make(ContextOption)
+	for _, modifier := range opt.Context {
+		modifier(context)
+	}
+
+	// Set defaults.
+	SetDefaultDomain(&opt.Domain)
+	SetDefaultStartPage(&opt.StartPage)
+	SetDefaultLimit(&opt.Limit)
+	SetDefaultPages(&opt.Pages)
+	SetDefaultUserAgent(&opt.UserAgent)
+
+	// Check validity of parameters.
+	err := opt.checkParameterValidity(context)
+	if err != nil {
+		return nil, err
+	}
+
+	payload := map[string]interface{}{
+		"source":          "google_ads",
+		"domain":          opt.Domain,
+		"query":           query,
+		"geo_location":    opt.GeoLocation,
+		"user_agent_type": opt.UserAgent,
+		"parse":           opt.Parse,
+		"render":          opt.Render,
+		"context": []map[string]interface{}{
+			{
+				"key":   "results_language",
+				"value": context["results_language"],
+			},
+			{
+				"key":   "nfpr",
+				"value": context["nfpr"],
+			},
+			{
+				"key":   "tbm",
+				"value": context["tbm"],
+			},
+			{
+				"key":   "tbs",
+				"value": context["tbs"],
+			},
+		},
+	}
+	jsonPayload, err := json.Marshal(payload)
+	if err != nil {
+		return nil, fmt.Errorf("error marshalling payload: %v", err)
+	}
+
+	res, err := c.Req(jsonPayload, opt.Parse, "POST")
+	if err != nil {
+		return nil, err
+	}
+
+	return res, nil
+}
+
+type GoogleSuggestionsOpts struct {
+	Locale      string
+	GeoLocation string
+	UserAgent   oxylabs.UserAgent
+	Render      oxylabs.Render
+	CallbackUrl string
+}
+
+// ScrapeGoogleSuggestions scrapes google via the google_suggestions source.
+func (c *SerpClient) ScrapeGoogleSuggestions(
+	query string,
+	opts ...*GoogleSuggestionsOpts,
+) (*Response, error) {
+	// Prepare options.
+	opt := &GoogleSuggestionsOpts{}
+	if len(opts) > 0 && opts[len(opts)-1] != nil {
+		opt = opts[len(opts)-1]
+	}
+
+	// Set defaults.
+ SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "google_suggestions", + "query": query, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload, false, "POST") + if err != nil { + return nil, err + } + + return res, nil +} + +type GoogleHotelsOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + Locale string + ResultsLanguage string + GeoLocation string + UserAgent oxylabs.UserAgent + Render oxylabs.Render + CallbackURL string + Context []func(ContextOption) +} + +// ScrapeGoogleHotels scrapes google via the google_hotels source. +func (c *SerpClient) ScrapeGoogleHotels( + query string, + opts ...*GoogleHotelsOpts, +) (*Response, error) { + // Prepare options. + opt := &GoogleHotelsOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google_hotels", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "results_language": opt.ResultsLanguage, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackURL, + "context": []map[string]interface{}{ + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "hotel_occupancy", + "value": context["hotel_occupancy"], + }, + { + "key": "hotel_dates", + "value": context["hotel_dates"], + }, + }, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload, false, "POST") + if err != nil { + return nil, err + } + + return res, nil +} + +type GoogleTravelHotelsOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Limit int + Locale string + GeoLocation string + Render oxylabs.Render + CallbackURL string + Context []func(ContextOption) +} + +// ScrapeGoogleTravelHotels scrapes google via the google_travel_hotels source. +func (c *SerpClient) ScrapeGoogleTravelHotels( + query string, + opts ...*GoogleTravelHotelsOpts, +) (*Response, error) { + // Prepare options. + opt := &GoogleTravelHotelsOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google_travel_hotels", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "render": opt.Render, + "callback_url": opt.CallbackURL, + "context": []map[string]interface{}{ + { + "key": "hotel_occupancy", + "value": context["hotel_occupancy"], + }, + { + "key": "hotel_classes", + "value": context["hotel_classes"], + }, + { + "key": "hotel_dates", + "value": context["hotel_dates"], + }, + }, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + res, err := c.Req(jsonPayload, false, "POST") + if err != nil { + return nil, err + } + + return res, nil +} + +type GoogleImagesOpts struct { + Domain oxylabs.Domain + StartPage int + Pages int + Locale string + GeoLocation string + UserAgent oxylabs.UserAgent + Render oxylabs.Render + CallbackURL string + Context []func(ContextOption) +} + +// ScrapeGoogleImages scrapes google via the google_images source. +func (c *SerpClient) ScrapeGoogleImages( + query string, + opts ...*GoogleImagesOpts, +) (*Response, error) { + // Prepare options. + opt := &GoogleImagesOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultPages(&opt.Pages) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+	payload := map[string]interface{}{
+		"source":          "google_images",
+		"domain":          opt.Domain,
+		"query":           query,
+		"start_page":      opt.StartPage,
+		"pages":           opt.Pages,
+		"locale":          opt.Locale,
+		"geo_location":    opt.GeoLocation,
+		"user_agent_type": opt.UserAgent,
+		"render":          opt.Render,
+		"callback_url":    opt.CallbackURL,
+		"context": []map[string]interface{}{
+			{
+				"key":   "nfpr",
+				"value": context["nfpr"],
+			},
+			{
+				"key":   "results_language",
+				"value": context["results_language"],
+			},
+		},
+	}
+	jsonPayload, err := json.Marshal(payload)
+	if err != nil {
+		return nil, fmt.Errorf("error marshalling payload: %v", err)
+	}
+
+	res, err := c.Req(jsonPayload, false, "POST")
+	if err != nil {
+		return nil, err
+	}
+
+	return res, nil
+}
+
+type GoogleTrendsExploreOpts struct {
+	GeoLocation string
+	Context     []func(ContextOption)
+	UserAgent   oxylabs.UserAgent
+	CallbackURL string
+}
+
+// ScrapeGoogleTrendsExplore scrapes google via the google_trends_explore source.
+func (c *SerpClient) ScrapeGoogleTrendsExplore(
+	query string,
+	opts ...*GoogleTrendsExploreOpts,
+) (*Response, error) {
+	// Prepare options.
+	opt := &GoogleTrendsExploreOpts{}
+	if len(opts) > 0 && opts[len(opts)-1] != nil {
+		opt = opts[len(opts)-1]
+	}
+
+	// Initialize the context map and apply each provided context modifier function.
+	context := make(ContextOption)
+	for _, modifier := range opt.Context {
+		modifier(context)
+	}
+
+	// Prepare payload.
+ payload := map[string]interface{}{ + "source": "google_trends_explore", + "query": query, + "geo_location": opt.GeoLocation, + "context": []map[string]interface{}{ + { + "key": "search_type", + "value": context["search_type"], + }, + { + "key": "date_from", + "value": context["date_from"], + }, + { + "key": "date_to", + "value": context["date_to"], + }, + { + "key": "category_id", + "value": context["category_id"], + }, + }, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackURL, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + fmt.Printf("%+v\n\n", payload) + res, err := c.Req(jsonPayload, false, "POST") + if err != nil { + return nil, err + } + + return res, nil +} From 2ffd8e4d8ddb07ce8c44e744fde182d0e8441d3c Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Fri, 8 Dec 2023 14:18:41 +0500 Subject: [PATCH 08/27] comments + some more checks --- serp/baidu.go | 4 ++-- serp/bing.go | 4 ++-- serp/google.go | 59 +++++++++++++++++++++++++++++++++++++++----------- serp/yandex.go | 4 ++-- 4 files changed, 52 insertions(+), 19 deletions(-) diff --git a/serp/baidu.go b/serp/baidu.go index a22cf38..26faf10 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -44,7 +44,7 @@ type BaiduSearchOpts struct { CallbackUrl string } -// Scrapes Baidu via its search engine. +// ScrapeBaiduSearch scrapes baidu with baidu_search as source. func (c *SerpClient) ScrapeBaiduSearch( query string, opts ...*BaiduSearchOpts, @@ -96,7 +96,7 @@ type BaiduUrlOpts struct { CallbackUrl string } -// Scrapes Baidu via its url. +// ScrapeBaiduUrl scrapes baidu with baidu as source. func (c *SerpClient) ScrapeBaiduUrl( url string, opts ...*BaiduUrlOpts, diff --git a/serp/bing.go b/serp/bing.go index f454f01..d6153b1 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -58,7 +58,7 @@ type BingSearchOpts struct { Render oxylabs.Render } -// Scrapes Bing via its search engine. 
+// ScraperBingSearch scrapes bing with bing_search as source. func (c *SerpClient) ScrapeBingSearch( query string, opts ...*BingSearchOpts, @@ -116,7 +116,7 @@ type BingUrlOpts struct { CallbackUrl string } -// Scrapes Bing via provided url. +// ScrapeBingUrl scrapes bing with bing as source. func (c *SerpClient) ScrapeBingUrl( url string, opts ...*BingUrlOpts, diff --git a/serp/google.go b/serp/google.go index 000b1b7..fbdf29d 100644 --- a/serp/google.go +++ b/serp/google.go @@ -7,6 +7,7 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// Accepted Parameters for context options in google. var AcceptedTbmParameters = []string{ "app", "bks", @@ -19,7 +20,6 @@ var AcceptedTbmParameters = []string{ "rcp", "lcl", } - var AcceptedSearchTypeParameters = []string{ "web_search", "image_search", @@ -109,11 +109,18 @@ func (opt *GoogleHotelsOpts) checkParameterValidity(ctx ContextOption) error { return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") } + if ctx["hotel_occupancy"] != nil && ctx["hotel_occupancy"].(int) < 0 { + return fmt.Errorf("invalid hotel_occupancy parameter: %v", ctx["hotel_occupancy"]) + } + return nil } // checkParameterValidity checks validity of google travel hotels parameters. 
func (opt *GoogleTravelHotelsOpts) checkParameterValidity(ctx ContextOption) error { + if !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { return fmt.Errorf("invalid render parameter: %v", opt.Render) @@ -123,12 +130,23 @@ func (opt *GoogleTravelHotelsOpts) checkParameterValidity(ctx ContextOption) err return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") } + if ctx["hotel_occupancy"] != nil && ctx["hotel_occupancy"].(int) < 0 { + return fmt.Errorf("invalid hotel_occupancy parameter: %v", ctx["hotel_occupancy"]) + } + + if ctx["hotel_classes"] != nil { + for _, value := range ctx["hotel_classes"].([]int) { + if value < 2 || value > 5 { + return fmt.Errorf("invalid hotel_classes parameter: %v", value) + } + } + } + return nil } // checkParameterValidity checks validity of google trends explore parameters. func (opt *GoogleTrendsExploreOpts) checkParameterValidity(ctx ContextOption) error { - if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) } @@ -137,6 +155,10 @@ func (opt *GoogleTrendsExploreOpts) checkParameterValidity(ctx ContextOption) er return fmt.Errorf("invalid search_type parameter: %v", ctx["search_type"]) } + if ctx["category_id"] != nil && ctx["category_id"].(int) < 0 { + return fmt.Errorf("invalid category_id") + } + return nil } @@ -591,6 +613,7 @@ type GoogleTravelHotelsOpts struct { Limit int Locale string GeoLocation string + UserAgent oxylabs.UserAgent Render oxylabs.Render CallbackURL string Context []func(ContextOption) @@ -627,16 +650,17 @@ func (c *SerpClient) ScrapeGoogleTravelHotels( // Prepare payload. 
payload := map[string]interface{}{ - "source": "google_travel_hotels", - "domain": opt.Domain, - "query": query, - "start_page": opt.StartPage, - "pages": opt.Pages, - "limit": opt.Limit, - "locale": opt.Locale, - "geo_location": opt.GeoLocation, - "render": opt.Render, - "callback_url": opt.CallbackURL, + "source": "google_travel_hotels", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackURL, "context": []map[string]interface{}{ { "key": "hotel_occupancy", @@ -765,6 +789,15 @@ func (c *SerpClient) ScrapeGoogleTrendsExplore( modifier(context) } + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + // Prepare payload. payload := map[string]interface{}{ "source": "google_trends_explore", @@ -795,7 +828,7 @@ func (c *SerpClient) ScrapeGoogleTrendsExplore( if err != nil { return nil, fmt.Errorf("error marshalling payload: %v", err) } - fmt.Printf("%+v\n\n", payload) + res, err := c.Req(jsonPayload, false, "POST") if err != nil { return nil, err diff --git a/serp/yandex.go b/serp/yandex.go index 84f7147..c80c828 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -70,7 +70,7 @@ type YandexSearchOpts struct { CallbackUrl string } -// Scrapes Yandex via its search engine. +// ScrapeYandexSearch scrapes yandex with yandex_search as source. func (c *SerpClient) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, @@ -126,7 +126,7 @@ type YandexUrlOpts struct { CallbackUrl string } -// Scrapes Yandex via provided url. +// ScrapeYandexUrl scrapes yandex with yandex as source.
func (c *SerpClient) ScrapeYandexUrl( url string, opts ...*YandexUrlOpts, From f2f2749a3b03c6e56486957d326cf4fd8e00597e Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Mon, 11 Dec 2023 20:59:12 +0500 Subject: [PATCH 09/27] check for async runtime models --- serp/client.go | 10 +++- serp/yandex_async.go | 131 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 serp/yandex_async.go diff --git a/serp/client.go b/serp/client.go index 68bc750..49fb116 100644 --- a/serp/client.go +++ b/serp/client.go @@ -30,12 +30,18 @@ func Init( } } +type SerpClientAsync struct { + HttpClient *http.Client + ApiCredentials *ApiCredentials + BaseUrl string +} + // Init for Async runtime model. func InitAsync( username string, password string, -) *SerpClient { - return &SerpClient{ +) *SerpClientAsync { + return &SerpClientAsync{ ApiCredentials: &ApiCredentials{ Username: username, Password: password, diff --git a/serp/yandex_async.go b/serp/yandex_async.go new file mode 100644 index 0000000..49f9dc1 --- /dev/null +++ b/serp/yandex_async.go @@ -0,0 +1,131 @@ +package serp + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" +) + +func (c *SerpClientAsync) ScrapeYandexSearch( + query string, + opts ...*YandexSearchOpts, +) (chan *Response, error) { + ResponseChan := make(chan *Response) + + opt := &YandexSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "yandex_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + client := &http.Client{} + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, _ := client.Do(request) + + responseBody, _ := io.ReadAll(response.Body) + + // unmarshal into job object + job := &Job{} + json.Unmarshal(responseBody, &job) + + request, _ = http.NewRequest("GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + + go func() { + for { + response, _ = client.Do(request) + responseBody, _ = io.ReadAll(response.Body) + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest("GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, _ = client.Do(request) + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + return + } + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + ResponseChan <- resp + } + + time.Sleep(2 * time.Second) + } + }() + + if err != nil { + return nil, err + } + + defer response.Body.Close() + + return ResponseChan, nil +} From 14e3d24e71f64bb8356beb7f984ec0bd105e28a7 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 12 Dec 2023 12:01:33 +0500 Subject: [PATCH 10/27] Apply suggestions from code review v3 + yandex --- oxylabs/common.go | 6 ++ serp/yandex_async.go | 194 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 186 insertions(+), 14 deletions(-) diff --git a/oxylabs/common.go b/oxylabs/common.go index 9735207..1337111 100644 --- a/oxylabs/common.go +++ b/oxylabs/common.go @@ -4,6 +4,12 @@ import ( "fmt" "net/url" "strings" + "time" +) + +var ( + DefaultTimeout = 30 * time.Second + DefaultWaitTime = 2 * time.Second ) func ValidateURL( diff --git a/serp/yandex_async.go b/serp/yandex_async.go index 49f9dc1..3c9dbbc 100644 --- a/serp/yandex_async.go +++ b/serp/yandex_async.go @@ -7,13 +7,16 @@ import ( "io" "net/http" "time" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// Scrapes yandex with yandex_search as source with async polling runtime. 
func (c *SerpClientAsync) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, ) (chan *Response, error) { - ResponseChan := make(chan *Response) + responseChan := make(chan *Response) opt := &YandexSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -60,38 +63,194 @@ func (c *SerpClientAsync) ScrapeYandexSearch( request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, _ := client.Do(request) + response, err := client.Do(request) + if err != nil { + return nil, err + } responseBody, _ := io.ReadAll(response.Body) + response.Body.Close() - // unmarshal into job object + // Unmarshal into job. job := &Job{} json.Unmarshal(responseBody, &job) - request, _ = http.NewRequest("GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = client.Do(request) + if err != nil { + return + } + + responseBody, _ = io.ReadAll(response.Body) + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = client.Do(request) + if err != nil { + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + return + } + response.Body.Close() + + // Send back error message. 
+ if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + // Unmarshal the JSON object. + resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + responseChan <- resp + } else if job.Status == "failed" { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeYandexUrl scrapes yandex with yandex as source with async polling runtime. +func (c *SerpClientAsync) ScrapeYandexUrl( + url string, + opts ...*YandexUrlOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + + // Check validity of url. + err := oxylabs.ValidateURL(url, "yandex") + if err != nil { + return nil, err + } + + opt := &YandexUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "yandex", + "url": url, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + client := &http.Client{} + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), ) + request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := client.Do(request) + if err != nil { + return nil, err + } + + responseBody, _ := io.ReadAll(response.Body) + response.Body.Close() + + // Unmarshal into job. + job := &Job{} + json.Unmarshal(responseBody, &job) go func() { + startNow := time.Now() + for { - response, _ = client.Do(request) + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = client.Do(request) + if err != nil { + return + } + responseBody, _ = io.ReadAll(response.Body) + response.Body.Close() json.Unmarshal(responseBody, &job) if job.Status == "done" { JobId := job.ID - request, _ = http.NewRequest("GET", + request, _ = http.NewRequest( + "GET", fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), nil, ) request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, _ = client.Do(request) + response, err = client.Do(request) + if err != nil { + return + } // Read the response body into a buffer. responseBody, err := io.ReadAll(response.Body) @@ -99,6 +258,7 @@ func (c *SerpClientAsync) ScrapeYandexSearch( err = fmt.Errorf("error reading response body: %v", err) return } + response.Body.Close() // Send back error message. 
if response.StatusCode != 200 { @@ -114,10 +274,18 @@ func (c *SerpClientAsync) ScrapeYandexSearch( } resp.StatusCode = response.StatusCode resp.Status = response.Status - ResponseChan <- resp + responseChan <- resp + } else if job.Status == "failed" { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return } - time.Sleep(2 * time.Second) + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) } }() @@ -125,7 +293,5 @@ func (c *SerpClientAsync) ScrapeYandexSearch( return nil, err } - defer response.Body.Close() - - return ResponseChan, nil + return responseChan, nil } From f7dd301a8ddeaca18e6e3a384404388872e2a0c4 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 12 Dec 2023 13:16:22 +0500 Subject: [PATCH 11/27] bing and baidu async models + some improvements --- serp/baidu.go | 5 + serp/baidu_async.go | 294 ++++++++++++++++++++++++++++++++++++++++++ serp/bing_async.go | 298 +++++++++++++++++++++++++++++++++++++++++++ serp/yandex_async.go | 20 +-- 4 files changed, 604 insertions(+), 13 deletions(-) create mode 100644 serp/baidu_async.go create mode 100644 serp/bing_async.go diff --git a/serp/baidu.go b/serp/baidu.go index 26faf10..6d24c54 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -23,6 +23,10 @@ func (opt *BaiduSearchOpts) checkParameterValidity() error { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) } + if opt.Limit <= 0 || opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + return nil } @@ -59,6 +63,7 @@ func (c *SerpClient) ScrapeBaiduSearch( SetDefaultDomain(&opt.Domain) SetDefaultStartPage(&opt.StartPage) SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) SetDefaultUserAgent(&opt.UserAgent) // Check validity of parameters. 
diff --git a/serp/baidu_async.go b/serp/baidu_async.go new file mode 100644 index 0000000..7a46cf4 --- /dev/null +++ b/serp/baidu_async.go @@ -0,0 +1,294 @@ +package serp + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// ScrapeBaiduSearch scrapes baidu with baidu_search as source with async polling runtime. +func (c *SerpClientAsync) ScrapeBaiduSearch( + query string, + opts ...*BaiduSearchOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + + // Prepare options + opt := &BaiduSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "baidu_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, _ := io.ReadAll(response.Body) + response.Body.Close() + + // Unmarshal into job.
+ job := &Job{} + err = json.Unmarshal(responseBody, &job) + if err != nil { + return nil, fmt.Errorf("error unmarshalling response body: %v", err) + } + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + responseBody, _ = io.ReadAll(response.Body) + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + responseChan <- resp + } else if job.Status == "failed" { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeBaiduUrl scrapes baidu with baidu as source with async polling runtime. +func (c *SerpClientAsync) ScrapeBaiduUrl( + url string, + opts ...*BaiduUrlOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + + // Check validity of url. + err := oxylabs.ValidateURL(url, "baidu") + if err != nil { + return nil, err + } + + // Prepare options + opt := &BaiduUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "baidu", + "url": url, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, _ := io.ReadAll(response.Body) + response.Body.Close() + + // Unmarshal into job.
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + responseBody, _ = io.ReadAll(response.Body) + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + responseChan <- resp + } else if job.Status == "failed" { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + if err != nil { + return nil, err + } + + return responseChan, nil +} diff --git a/serp/bing_async.go b/serp/bing_async.go new file mode 100644 index 0000000..2515d7c --- /dev/null +++ b/serp/bing_async.go @@ -0,0 +1,298 @@ +package serp + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// ScrapeBingSearch scrapes bing with bing_search as source with async polling runtime. +func (c *SerpClientAsync) ScrapeBingSearch( + query string, + opts ...*BingSearchOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + + // Prepare options + opt := &BingSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "bing_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackUrl, + "render": opt.Render, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, _ := io.ReadAll(response.Body) + response.Body.Close() + + // Unmarshal into job. + job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + responseBody, _ = io.ReadAll(response.Body) + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + // Read the response body into a buffer. 
+ responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + // Unmarshal the JSON object. + resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + responseChan <- resp + } else if job.Status == "failed" { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeBingUrl scrapes bing with bing as source with async polling runtime. +func (c *SerpClientAsync) ScrapeBingUrl( + url string, + opts ...*BingUrlOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + + // Check validity of url. + err := oxylabs.ValidateURL(url, "bing") + if err != nil { + return nil, err + } + + // Prepare options. + opt := &BingUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "bing", + "url": url, + "user_agent_type": opt.UserAgent, + "geo_location": opt.GeoLocation, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, _ := io.ReadAll(response.Body) + response.Body.Close() + + // Unmarshal into job. + job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + responseBody, _ = io.ReadAll(response.Body) + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + return + } + response.Body.Close() + + // Send back error message. 
+ if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + // Unmarshal the JSON object. + resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + responseChan <- resp + } else if job.Status == "failed" { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + if err != nil { + return nil, err + } + + return responseChan, nil +} diff --git a/serp/yandex_async.go b/serp/yandex_async.go index 3c9dbbc..cb6fd38 100644 --- a/serp/yandex_async.go +++ b/serp/yandex_async.go @@ -11,7 +11,7 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// Scrapes yandex with yandex_search as source with async polling runtime. +// ScrapeYandexSearch scrapes yandex with yandex_search as source with async polling runtime. 
func (c *SerpClientAsync) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, @@ -54,16 +54,14 @@ func (c *SerpClientAsync) ScrapeYandexSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - client := &http.Client{} request, _ := http.NewRequest( "POST", c.BaseUrl, bytes.NewBuffer(jsonPayload), ) - request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := client.Do(request) + response, err := c.HttpClient.Do(request) if err != nil { return nil, err } @@ -86,7 +84,7 @@ func (c *SerpClientAsync) ScrapeYandexSearch( ) request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = client.Do(request) + response, err = c.HttpClient.Do(request) if err != nil { return } @@ -103,10 +101,9 @@ func (c *SerpClientAsync) ScrapeYandexSearch( fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), nil, ) - request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = client.Do(request) + response, err = c.HttpClient.Do(request) if err != nil { return } @@ -195,16 +192,14 @@ func (c *SerpClientAsync) ScrapeYandexUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - client := &http.Client{} request, _ := http.NewRequest( "POST", c.BaseUrl, bytes.NewBuffer(jsonPayload), ) - request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := client.Do(request) + response, err := c.HttpClient.Do(request) if err != nil { return nil, err } @@ -227,7 +222,7 @@ func (c *SerpClientAsync) ScrapeYandexUrl( ) request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = client.Do(request) + response, err = 
c.HttpClient.Do(request) if err != nil { return } @@ -244,10 +239,9 @@ func (c *SerpClientAsync) ScrapeYandexUrl( fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), nil, ) - request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = client.Do(request) + response, err = c.HttpClient.Do(request) if err != nil { return } From 905ed7c27549c2da3a2f3f041ff3a7641f5cc3a2 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 12 Dec 2023 17:06:38 +0500 Subject: [PATCH 12/27] 2 google funcs + better error handling with channels --- serp/baidu_async.go | 69 ++++++- serp/bing_async.go | 69 ++++++- serp/google_async.go | 438 +++++++++++++++++++++++++++++++++++++++++++ serp/yandex_async.go | 62 +++++- 4 files changed, 613 insertions(+), 25 deletions(-) create mode 100644 serp/google_async.go diff --git a/serp/baidu_async.go b/serp/baidu_async.go index 7a46cf4..7865978 100644 --- a/serp/baidu_async.go +++ b/serp/baidu_async.go @@ -17,6 +17,7 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( opts ...*BaiduSearchOpts, ) (chan *Response, error) { responseChan := make(chan *Response) + errChan := make(chan error) // Prepare options opt := &BaiduSearchOpts{} @@ -65,7 +66,10 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( return nil, err } - responseBody, _ := io.ReadAll(response.Body) + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } response.Body.Close() // Unmarshal into job. 
@@ -88,10 +92,18 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } - responseBody, _ = io.ReadAll(response.Body) + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } response.Body.Close() json.Unmarshal(responseBody, &job) @@ -107,6 +119,8 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -114,6 +128,8 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( responseBody, err := io.ReadAll(response.Body) if err != nil { err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) return } response.Body.Close() @@ -121,6 +137,8 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( // Send back error message. 
if response.StatusCode != 200 { err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) return } @@ -128,18 +146,25 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( resp := &Response{} if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) return } resp.StatusCode = response.StatusCode resp.Status = response.Status + close(errChan) responseChan <- resp - } else if job.Status == "failed" { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) return } if time.Since(startNow) > oxylabs.DefaultTimeout { err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) return } @@ -147,6 +172,7 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( } }() + err = <-errChan if err != nil { return nil, err } @@ -160,6 +186,7 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( opts ...*BaiduUrlOpts, ) (chan *Response, error) { responseChan := make(chan *Response) + errChan := make(chan error) // Check validity of url. err := oxylabs.ValidateURL(url, "baidu") @@ -206,7 +233,10 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( return nil, err } - responseBody, _ := io.ReadAll(response.Body) + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } response.Body.Close() // Unmarshal into job. 
@@ -226,10 +256,18 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } - responseBody, _ = io.ReadAll(response.Body) + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } response.Body.Close() json.Unmarshal(responseBody, &job) @@ -241,11 +279,12 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), nil, ) - request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -253,6 +292,8 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( responseBody, err := io.ReadAll(response.Body) if err != nil { err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) return } response.Body.Close() @@ -260,6 +301,8 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( // Send back error message. 
if response.StatusCode != 200 { err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) return } @@ -267,18 +310,25 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( resp := &Response{} if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) return } resp.StatusCode = response.StatusCode resp.Status = response.Status + close(errChan) responseChan <- resp - } else if job.Status == "failed" { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) return } if time.Since(startNow) > oxylabs.DefaultTimeout { err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) return } @@ -286,6 +336,7 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( } }() + err = <-errChan if err != nil { return nil, err } diff --git a/serp/bing_async.go b/serp/bing_async.go index 2515d7c..73322e7 100644 --- a/serp/bing_async.go +++ b/serp/bing_async.go @@ -17,6 +17,7 @@ func (c *SerpClientAsync) ScrapeBingSearch( opts ...*BingSearchOpts, ) (chan *Response, error) { responseChan := make(chan *Response) + errChan := make(chan error) // Prepare options opt := &BingSearchOpts{} @@ -69,7 +70,10 @@ func (c *SerpClientAsync) ScrapeBingSearch( return nil, err } - responseBody, _ := io.ReadAll(response.Body) + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } response.Body.Close() // Unmarshal into job. 
@@ -89,10 +93,18 @@ func (c *SerpClientAsync) ScrapeBingSearch( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } - responseBody, _ = io.ReadAll(response.Body) + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } response.Body.Close() json.Unmarshal(responseBody, &job) @@ -104,11 +116,12 @@ func (c *SerpClientAsync) ScrapeBingSearch( fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), nil, ) - request.Header.Add("Content-type", "application/json") request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -116,6 +129,8 @@ func (c *SerpClientAsync) ScrapeBingSearch( responseBody, err := io.ReadAll(response.Body) if err != nil { err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) return } response.Body.Close() @@ -123,6 +138,8 @@ func (c *SerpClientAsync) ScrapeBingSearch( // Send back error message. 
if response.StatusCode != 200 { err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) return } @@ -130,18 +147,25 @@ func (c *SerpClientAsync) ScrapeBingSearch( resp := &Response{} if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) return } resp.StatusCode = response.StatusCode resp.Status = response.Status + close(errChan) responseChan <- resp - } else if job.Status == "failed" { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) return } if time.Since(startNow) > oxylabs.DefaultTimeout { err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) return } @@ -149,6 +173,7 @@ func (c *SerpClientAsync) ScrapeBingSearch( } }() + err = <-errChan if err != nil { return nil, err } @@ -162,6 +187,7 @@ func (c *SerpClientAsync) ScrapeBingUrl( opts ...*BingUrlOpts, ) (chan *Response, error) { responseChan := make(chan *Response) + errChan := make(chan error) // Check validity of url. err := oxylabs.ValidateURL(url, "bing") @@ -211,7 +237,10 @@ func (c *SerpClientAsync) ScrapeBingUrl( return nil, err } - responseBody, _ := io.ReadAll(response.Body) + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } response.Body.Close() // Unmarshal into job. 
@@ -231,10 +260,18 @@ func (c *SerpClientAsync) ScrapeBingUrl( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } - responseBody, _ = io.ReadAll(response.Body) + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } response.Body.Close() json.Unmarshal(responseBody, &job) @@ -250,6 +287,8 @@ func (c *SerpClientAsync) ScrapeBingUrl( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -257,6 +296,8 @@ func (c *SerpClientAsync) ScrapeBingUrl( responseBody, err := io.ReadAll(response.Body) if err != nil { err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) return } response.Body.Close() @@ -264,6 +305,8 @@ func (c *SerpClientAsync) ScrapeBingUrl( // Send back error message. 
if response.StatusCode != 200 { err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) return } @@ -271,18 +314,25 @@ func (c *SerpClientAsync) ScrapeBingUrl( resp := &Response{} if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) return } resp.StatusCode = response.StatusCode resp.Status = response.Status + close(errChan) responseChan <- resp - } else if job.Status == "failed" { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) return } if time.Since(startNow) > oxylabs.DefaultTimeout { err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) return } @@ -290,6 +340,7 @@ func (c *SerpClientAsync) ScrapeBingUrl( } }() + err = <-errChan if err != nil { return nil, err } diff --git a/serp/google_async.go b/serp/google_async.go new file mode 100644 index 0000000..285723a --- /dev/null +++ b/serp/google_async.go @@ -0,0 +1,438 @@ +package serp + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// ScrapeGoogleSearch scrapes google with google_search as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleSearch( + query string, + opts ...*GoogleSearchOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Prepare options. + opt := &GoogleSearchOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map and apply each provided context modifier function. 
+ context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Check if limit_per_page context parameter is used together with limit, start_page or pages parameters. + if (opt.Limit != 0 || opt.StartPage != 0 || opt.Pages != 0) && context["limit_per_page"] != nil { + return nil, fmt.Errorf("limit, start_page and pages parameters cannot be used together with limit_per_page context parameter") + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. + var payload map[string]interface{} + + // If user sends limit_per_page context parameter, use it instead of limit, start_page and pages parameters. + if context["limit_per_page"] != nil { + payload = map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "geo_location": opt.Geolocation, + "user_agent_type": opt.UserAgent, + "parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "filter", + "value": context["filter"], + }, + { + "key": "limit_per_page", + "value": context["limit_per_page"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "safe_search", + "value": context["safe_search"], + }, + { + "key": "fpstate", + "value": context["fpstate"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } + } else { + payload = map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "geo_location": opt.Geolocation, + "user_agent_type": opt.UserAgent, + 
"parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "filter", + "value": context["filter"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "safe_search", + "value": context["safe_search"], + }, + { + "key": "fpstate", + "value": context["fpstate"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } + } + + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeGoogleUrl scrapes google with google as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleUrl( + url string, + opts ...*GoogleUrlOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Check validity of url. + err := oxylabs.ValidateURL(url, "google") + if err != nil { + return nil, err + } + + // Prepare options. + opt := &GoogleUrlOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err = opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google", + "url": url, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + "geo_location": opt.GeoLocation, + "parse": opt.Parse, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. + job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read 
the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. + resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} diff --git a/serp/yandex_async.go b/serp/yandex_async.go index cb6fd38..50e1409 100644 --- a/serp/yandex_async.go +++ b/serp/yandex_async.go @@ -17,7 +17,9 @@ func (c *SerpClientAsync) ScrapeYandexSearch( opts ...*YandexSearchOpts, ) (chan *Response, error) { responseChan := make(chan *Response) + errChan := make(chan error) + // Prepare options. 
opt := &YandexSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] @@ -66,7 +68,10 @@ func (c *SerpClientAsync) ScrapeYandexSearch( return nil, err } - responseBody, _ := io.ReadAll(response.Body) + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } response.Body.Close() // Unmarshal into job. @@ -86,10 +91,18 @@ func (c *SerpClientAsync) ScrapeYandexSearch( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } - responseBody, _ = io.ReadAll(response.Body) + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } response.Body.Close() json.Unmarshal(responseBody, &job) @@ -105,6 +118,8 @@ func (c *SerpClientAsync) ScrapeYandexSearch( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -112,6 +127,8 @@ func (c *SerpClientAsync) ScrapeYandexSearch( responseBody, err := io.ReadAll(response.Body) if err != nil { err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) return } response.Body.Close() @@ -119,6 +136,8 @@ func (c *SerpClientAsync) ScrapeYandexSearch( // Send back error message. 
if response.StatusCode != 200 { err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) return } @@ -126,18 +145,25 @@ func (c *SerpClientAsync) ScrapeYandexSearch( resp := &Response{} if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) return } resp.StatusCode = response.StatusCode resp.Status = response.Status + close(errChan) responseChan <- resp - } else if job.Status == "failed" { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) return } if time.Since(startNow) > oxylabs.DefaultTimeout { err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) return } @@ -145,6 +171,7 @@ func (c *SerpClientAsync) ScrapeYandexSearch( } }() + err = <-errChan if err != nil { return nil, err } @@ -158,6 +185,7 @@ func (c *SerpClientAsync) ScrapeYandexUrl( opts ...*YandexUrlOpts, ) (chan *Response, error) { responseChan := make(chan *Response) + errChan := make(chan error) // Check validity of url. err := oxylabs.ValidateURL(url, "yandex") @@ -165,6 +193,7 @@ func (c *SerpClientAsync) ScrapeYandexUrl( return nil, err } + // Prepare options. opt := &YandexUrlOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] @@ -204,7 +233,10 @@ func (c *SerpClientAsync) ScrapeYandexUrl( return nil, err } - responseBody, _ := io.ReadAll(response.Body) + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } response.Body.Close() // Unmarshal into job. 
@@ -224,6 +256,8 @@ func (c *SerpClientAsync) ScrapeYandexUrl( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -243,6 +277,8 @@ func (c *SerpClientAsync) ScrapeYandexUrl( request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) response, err = c.HttpClient.Do(request) if err != nil { + errChan <- err + close(responseChan) return } @@ -250,6 +286,8 @@ func (c *SerpClientAsync) ScrapeYandexUrl( responseBody, err := io.ReadAll(response.Body) if err != nil { err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) return } response.Body.Close() @@ -257,6 +295,8 @@ func (c *SerpClientAsync) ScrapeYandexUrl( // Send back error message. if response.StatusCode != 200 { err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) return } @@ -264,18 +304,25 @@ func (c *SerpClientAsync) ScrapeYandexUrl( resp := &Response{} if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) return } resp.StatusCode = response.StatusCode resp.Status = response.Status + close(errChan) responseChan <- resp - } else if job.Status == "failed" { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) return } if time.Since(startNow) > oxylabs.DefaultTimeout { err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) return } @@ -283,6 +330,7 @@ func (c *SerpClientAsync) ScrapeYandexUrl( } }() + err = <-errChan if err != nil { return nil, err } From 1420df5cfac435cc4932ed37bad022f92772271d Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 12 
Dec 2023 19:43:42 +0500 Subject: [PATCH 13/27] rest of google sources for async polling model --- oxylabs/common.go | 2 +- serp/google.go | 16 +- serp/google_async.go | 1102 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1114 insertions(+), 6 deletions(-) diff --git a/oxylabs/common.go b/oxylabs/common.go index 1337111..0f3d6c5 100644 --- a/oxylabs/common.go +++ b/oxylabs/common.go @@ -8,7 +8,7 @@ import ( ) var ( - DefaultTimeout = 30 * time.Second + DefaultTimeout = 50 * time.Second DefaultWaitTime = 2 * time.Second ) diff --git a/serp/google.go b/serp/google.go index fbdf29d..d623877 100644 --- a/serp/google.go +++ b/serp/google.go @@ -703,9 +703,15 @@ type GoogleImagesOpts struct { // ScrapeGoogleImages scrapes google via the google_images source. func (c *SerpClient) ScrapeGoogleImages( - query string, + url string, opts ...*GoogleImagesOpts, ) (*Response, error) { + // Check validity of url. + err := oxylabs.ValidateURL(url, "google") + if err != nil { + return nil, err + } + // Prepare options. opt := &GoogleImagesOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { @@ -724,16 +730,16 @@ func (c *SerpClient) ScrapeGoogleImages( SetDefaultPages(&opt.Pages) // Check validity of parameters. - err := opt.checkParameterValidity(context) + err = opt.checkParameterValidity(context) if err != nil { return nil, err } // Prepare payload. 
payload := map[string]interface{}{ - "source": "google_travel_hotels", + "source": "google_images", "domain": opt.Domain, - "query": query, + "query": url, "start_page": opt.StartPage, "pages": opt.Pages, "locale": opt.Locale, @@ -766,7 +772,7 @@ func (c *SerpClient) ScrapeGoogleImages( } type GoogleTrendsExploreOpts struct { - GeoLocation string + GeoLocation *string Context []func(ContextOption) UserAgent oxylabs.UserAgent CallbackURL string diff --git a/serp/google_async.go b/serp/google_async.go index 285723a..a9635b4 100644 --- a/serp/google_async.go +++ b/serp/google_async.go @@ -436,3 +436,1105 @@ func (c *SerpClientAsync) ScrapeGoogleUrl( return responseChan, nil } + +// ScrapeGoogleAds scrapes google with google_ads as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleAds( + query string, + opts ...*GoogleAdsOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Prepare options. + opt := &GoogleAdsOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. 
+ err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + payload := map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeGoogleSuggestions scrapes google with google_suggestions as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleSuggestions( + query string, + opts ...*GoogleSuggestionsOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Prepare options. + opt := &GoogleSuggestionsOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity() + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google_suggestions", + "query": query, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackUrl, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. + job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the 
response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. + resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeGoogleTravelHotels scrapes google with google_hotels as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleHotels( + query string, + opts ...*GoogleHotelsOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Prepare options. + opt := &GoogleHotelsOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. 
+ SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. + payload := map[string]interface{}{ + "source": "google_hotels", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "results_language": opt.ResultsLanguage, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackURL, + "context": []map[string]interface{}{ + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "hotel_occupancy", + "value": context["hotel_occupancy"], + }, + { + "key": "hotel_dates", + "value": context["hotel_dates"], + }, + }, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeGoogleTravelHotels scrapes google with google_travel_hotels as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleTravelHotels( + query string, + opts ...*GoogleTravelHotelsOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Prepare options. + opt := &GoogleTravelHotelsOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultLimit(&opt.Limit) + SetDefaultPages(&opt.Pages) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google_travel_hotels", + "domain": opt.Domain, + "query": query, + "start_page": opt.StartPage, + "pages": opt.Pages, + "limit": opt.Limit, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackURL, + "context": []map[string]interface{}{ + { + "key": "hotel_occupancy", + "value": context["hotel_occupancy"], + }, + { + "key": "hotel_classes", + "value": context["hotel_classes"], + }, + { + "key": "hotel_dates", + "value": context["hotel_dates"], + }, + }, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeGoogleImages scrapes google with google_images as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleImages( + url string, + opts ...*GoogleImagesOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Check validity of url. + err := oxylabs.ValidateURL(url, "google") + if err != nil { + return nil, err + } + + // Prepare options. + opt := &GoogleImagesOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultDomain(&opt.Domain) + SetDefaultStartPage(&opt.StartPage) + SetDefaultPages(&opt.Pages) + + // Check validity of parameters. + err = opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google_images", + "domain": opt.Domain, + "query": url, + "start_page": opt.StartPage, + "pages": opt.Pages, + "locale": opt.Locale, + "geo_location": opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "render": opt.Render, + "callback_url": opt.CallbackURL, + "context": []map[string]interface{}{ + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "results_language", + "value": context["results_language"], + }, + }, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} + +// ScrapeGoogleTrendsExplore scrapes google with google_trends_explore as source with async polling runtime. +func (c *SerpClientAsync) ScrapeGoogleTrendsExplore( + query string, + opts ...*GoogleTrendsExploreOpts, +) (chan *Response, error) { + responseChan := make(chan *Response) + errChan := make(chan error) + + // Prepare options. + opt := &GoogleTrendsExploreOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Initialize the context map apply each provided context modifier function. + context := make(ContextOption) + for _, modifier := range opt.Context { + modifier(context) + } + + // Set defaults. + SetDefaultUserAgent(&opt.UserAgent) + + // Check validity of parameters. + err := opt.checkParameterValidity(context) + if err != nil { + return nil, err + } + + // Prepare payload. 
+ payload := map[string]interface{}{ + "source": "google_trends_explore", + "query": query, + "geo_location": &opt.GeoLocation, + "context": []map[string]interface{}{ + { + "key": "search_type", + "value": context["search_type"], + }, + { + "key": "date_from", + "value": context["date_from"], + }, + { + "key": "date_to", + "value": context["date_to"], + }, + { + "key": "category_id", + "value": context["category_id"], + }, + }, + "user_agent_type": opt.UserAgent, + "callback_url": opt.CallbackURL, + } + jsonPayload, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("error marshalling payload: %v", err) + } + + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, err + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + + if response.StatusCode == 400 { + return nil, fmt.Errorf("error with status code %v: %v", response.StatusCode, string(responseBody)) + } + + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + json.Unmarshal(responseBody, &job) + + go func() { + startNow := time.Now() + + for { + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + responseBody, err = io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + json.Unmarshal(responseBody, &job) + + if job.Status == "done" { + JobId := job.ID + request, _ = http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err = c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. 
+ resp := &Response{} + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } + }() + + err = <-errChan + if err != nil { + return nil, err + } + + return responseChan, nil +} From d14237b7405c4fac7ea190b2d7ad2533560d2d3e Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Sat, 16 Dec 2023 19:46:46 +0500 Subject: [PATCH 14/27] parse checks in google_async + some comment fixes --- oxylabs/common.go | 2 +- serp/baidu.go | 4 ++-- serp/baidu_async.go | 4 ++-- serp/bing.go | 2 +- serp/bing_async.go | 2 +- serp/google_async.go | 3 +++ serp/yandex.go | 2 +- 7 files changed, 11 insertions(+), 8 deletions(-) diff --git a/oxylabs/common.go b/oxylabs/common.go index 0f3d6c5..3a7d0c1 100644 --- a/oxylabs/common.go +++ b/oxylabs/common.go @@ -21,7 +21,7 @@ func ValidateURL( return fmt.Errorf("url parameter is empty") } - // Parse the URL + // Parse the URL. parsedURL, err := url.ParseRequestURI(inputURL) if err != nil { return fmt.Errorf("failed to parse URL: %v", err) diff --git a/serp/baidu.go b/serp/baidu.go index 6d24c54..3bbba7d 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -53,7 +53,7 @@ func (c *SerpClient) ScrapeBaiduSearch( query string, opts ...*BaiduSearchOpts, ) (*Response, error) { - // Prepare options + // Prepare options. 
opt := &BaiduSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] @@ -112,7 +112,7 @@ func (c *SerpClient) ScrapeBaiduUrl( return nil, err } - // Prepare options + // Prepare options. opt := &BaiduUrlOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] diff --git a/serp/baidu_async.go b/serp/baidu_async.go index 7865978..1540a60 100644 --- a/serp/baidu_async.go +++ b/serp/baidu_async.go @@ -19,7 +19,7 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( responseChan := make(chan *Response) errChan := make(chan error) - // Prepare options + // Prepare options. opt := &BaiduSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] @@ -194,7 +194,7 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( return nil, err } - // Prepare options + // Prepare options. opt := &BaiduUrlOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] diff --git a/serp/bing.go b/serp/bing.go index d6153b1..2dde75c 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -63,7 +63,7 @@ func (c *SerpClient) ScrapeBingSearch( query string, opts ...*BingSearchOpts, ) (*Response, error) { - // Prepare options + // Prepare options. opt := &BingSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] diff --git a/serp/bing_async.go b/serp/bing_async.go index 73322e7..47318d8 100644 --- a/serp/bing_async.go +++ b/serp/bing_async.go @@ -19,7 +19,7 @@ func (c *SerpClientAsync) ScrapeBingSearch( responseChan := make(chan *Response) errChan := make(chan error) - // Prepare options + // Prepare options. opt := &BingSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] diff --git a/serp/google_async.go b/serp/google_async.go index a9635b4..df4f530 100644 --- a/serp/google_async.go +++ b/serp/google_async.go @@ -234,6 +234,7 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( // Unmarshal the JSON object. 
resp := &Response{} + resp.Parse = opt.Parse if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) errChan <- err @@ -401,6 +402,7 @@ func (c *SerpClientAsync) ScrapeGoogleUrl( // Unmarshal the JSON object. resp := &Response{} + resp.Parse = opt.Parse if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) errChan <- err @@ -589,6 +591,7 @@ func (c *SerpClientAsync) ScrapeGoogleAds( // Unmarshal the JSON object. resp := &Response{} + resp.Parse = opt.Parse if err := resp.UnmarshalJSON(responseBody); err != nil { err = fmt.Errorf("failed to parse JSON object: %v", err) errChan <- err diff --git a/serp/yandex.go b/serp/yandex.go index c80c828..f84cecc 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -75,7 +75,7 @@ func (c *SerpClient) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, ) (*Response, error) { - // Prepare options + // Prepare options. opt := &YandexSearchOpts{} if len(opts) > 0 && opts[len(opts)-1] != nil { opt = opts[len(opts)-1] From d3c3c640c4d7c4ce53ffdc3aef1eb69509da3729 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Sat, 16 Dec 2023 19:47:48 +0500 Subject: [PATCH 15/27] proxy endpoint integration method --- serp/client.go | 46 ++++++++++++++++++ serp/proxy_endpint.go | 106 ++++++++++++++++++++++++++++++++++++++++++ serp/response.go | 7 +++ 3 files changed, 159 insertions(+) create mode 100644 serp/proxy_endpint.go diff --git a/serp/client.go b/serp/client.go index 49fb116..a3028c3 100644 --- a/serp/client.go +++ b/serp/client.go @@ -1,7 +1,10 @@ package serp import ( + "crypto/tls" + "fmt" "net/http" + "net/url" ) type ApiCredentials struct { @@ -50,3 +53,46 @@ func InitAsync( BaseUrl: "https://data.oxylabs.io/v1/queries", } } + +type SerpClientProxy struct { + HttpClient *http.Client + ApiCredentials *ApiCredentials +} + +// Init for Proxy runtime model. 
+func InitProxy( + username string, + password string, +) *SerpClientProxy { + // Prepare API credentials. + apiCredentials := &ApiCredentials{ + Username: username, + Password: password, + } + + // Prepare proxy url. + proxyUrl, err := url.Parse( + fmt.Sprintf( + "http://%s:%s@realtime.oxylabs.io:60000", + apiCredentials.Username, + apiCredentials.Password, + ), + ) + if err != nil { + fmt.Printf("error parsing proxy url: %v", err) + } + + // Prepare custom transport. + customTransport := &http.Transport{Proxy: http.ProxyURL(proxyUrl)} + customTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + + client := &http.Client{Transport: customTransport} + + return &SerpClientProxy{ + ApiCredentials: &ApiCredentials{ + Username: username, + Password: password, + }, + HttpClient: client, + } +} diff --git a/serp/proxy_endpint.go b/serp/proxy_endpint.go new file mode 100644 index 0000000..a671c65 --- /dev/null +++ b/serp/proxy_endpint.go @@ -0,0 +1,106 @@ +package serp + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +type ScrapeProxyOpts struct { + UserAgent oxylabs.UserAgent + GeoLocation string + Render oxylabs.Render + Parser *string +} + +// checkParameterValidity checks validity of google search parameters. +func (opt *ScrapeProxyOpts) checkParameterValidity() error { + if opt.UserAgent != "" && !oxylabs.IsUserAgentValid(opt.UserAgent) { + return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) + } + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + return nil +} + +// ScrapeProxyUrl scrapes via proxy endpoint. +func (c *SerpClientProxy) ScrapeProxyUrl( + url string, + opts ...*ScrapeProxyOpts, +) (*ResponseProxy, error) { + // Prepare options. + opt := &ScrapeProxyOpts{} + if len(opts) > 0 && opts[len(opts)-1] != nil { + opt = opts[len(opts)-1] + } + + // Check validity of parameters. 
+ if err := opt.checkParameterValidity(); err != nil { + return nil, err + } + + // Prepare request. + request, err := http.NewRequest( + "GET", + url, + nil, + ) + if err != nil { + return nil, fmt.Errorf("error creating request: %v", err) + } + + // If options are provided, add them to the request. + if opt.UserAgent != "" { + request.Header.Add("x-oxylabs-user-agent-type", string(opt.UserAgent)) + } + if opt.GeoLocation != "" { + request.Header.Add("x-oxylabs-geo-location", opt.GeoLocation) + } + if opt.Render != "" { + request.Header.Add("x-oxylabs-render", string(opt.Render)) + } + if opt.Parser != nil { + request.Header.Add("x-oxylabs-parse", "1") + request.Header.Add("x-oxylabs-parser", *opt.Parser) + } + + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + + // Get response. + response, err := c.HttpClient.Do(request) + if err != nil { + return nil, fmt.Errorf("error making request: %v", err) + } + defer response.Body.Close() + + // Read response body. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return nil, fmt.Errorf("error reading response body: %v", err) + } + + // Send back error message. + if response.StatusCode != 200 { + return nil, fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + } + + // Prepare response. + resp := &ResponseProxy{} + if opt.Parser != nil { + json.Unmarshal(responseBody, &resp.ContentParsed) + } else { + resp.Content = string(responseBody) + } + + // Set status code and status. 
+ resp.StatusCode = response.StatusCode + resp.Status = response.Status + + return resp, nil +} diff --git a/serp/response.go b/serp/response.go index fe8fb18..47d7279 100644 --- a/serp/response.go +++ b/serp/response.go @@ -91,6 +91,13 @@ type Response struct { Status string `json:"status"` } +type ResponseProxy struct { + ContentParsed Content + Content string + StatusCode int + Status string +} + type Job struct { CallbackURL string `json:"callback_url"` ClientID int `json:"client_id"` From ace829f3f56e382b10ca789fa23f6e15772dce93 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Sat, 16 Dec 2023 21:32:27 +0500 Subject: [PATCH 16/27] send custom headers with proxy endpoint --- serp/proxy_endpint.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/serp/proxy_endpint.go b/serp/proxy_endpint.go index a671c65..010d8c4 100644 --- a/serp/proxy_endpint.go +++ b/serp/proxy_endpint.go @@ -10,10 +10,11 @@ import ( ) type ScrapeProxyOpts struct { - UserAgent oxylabs.UserAgent - GeoLocation string - Render oxylabs.Render - Parser *string + UserAgent oxylabs.UserAgent + GeoLocation string + Render oxylabs.Render + Parser *string + CustomHeaders map[string]string } // checkParameterValidity checks validity of google search parameters. 
@@ -69,6 +70,11 @@ func (c *SerpClientProxy) ScrapeProxyUrl( request.Header.Add("x-oxylabs-parse", "1") request.Header.Add("x-oxylabs-parser", *opt.Parser) } + if opt.CustomHeaders != nil { + for key, value := range opt.CustomHeaders { + request.Header.Add(key, value) + } + } request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) From 0d54a2200ad666950e3e9a811a1632f328ffa856 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Mon, 18 Dec 2023 12:16:05 +0500 Subject: [PATCH 17/27] make GeoLocation param a ptr --- serp/bing.go | 4 ++-- serp/google.go | 16 ++++++++-------- serp/google_async.go | 4 ++-- serp/yandex.go | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/serp/bing.go b/serp/bing.go index 2dde75c..7c65e26 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -52,7 +52,7 @@ type BingSearchOpts struct { Pages int Limit int Locale oxylabs.Locale - GeoLocation string + GeoLocation *string UserAgent oxylabs.UserAgent CallbackUrl string Render oxylabs.Render @@ -111,7 +111,7 @@ func (c *SerpClient) ScrapeBingSearch( type BingUrlOpts struct { UserAgent oxylabs.UserAgent - GeoLocation string + GeoLocation *string Render oxylabs.Render CallbackUrl string } diff --git a/serp/google.go b/serp/google.go index d623877..9d68ff7 100644 --- a/serp/google.go +++ b/serp/google.go @@ -168,7 +168,7 @@ type GoogleSearchOpts struct { Pages int Limit int Locale oxylabs.Locale - Geolocation string + GeoLocation *string UserAgent oxylabs.UserAgent Render oxylabs.Render Parse bool @@ -219,7 +219,7 @@ func (c *SerpClient) ScrapeGoogleSearch( "source": "google_search", "domain": opt.Domain, "query": query, - "geo_location": opt.Geolocation, + "geo_location": opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -266,7 +266,7 @@ func (c *SerpClient) ScrapeGoogleSearch( "start_page": opt.StartPage, "pages": opt.Pages, "limit": opt.Limit, - "geo_location": opt.Geolocation, + "geo_location": opt.GeoLocation, 
"user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -317,7 +317,7 @@ func (c *SerpClient) ScrapeGoogleSearch( } type GoogleUrlOpts struct { - GeoLocation string + GeoLocation *string UserAgent oxylabs.UserAgent Render oxylabs.Render Parse bool @@ -393,7 +393,7 @@ type GoogleAdsOpts struct { Pages int Limit int Locale string - GeoLocation string + GeoLocation *string UserAgent oxylabs.UserAgent Render oxylabs.Render Parse bool @@ -472,7 +472,7 @@ func (c *SerpClient) ScrapeGoogleAds( type GoogleSuggestionsOpts struct { Locale string - GeoLocation string + GeoLocation *string UserAgent oxylabs.UserAgent Render oxylabs.Render CallbackUrl string @@ -612,7 +612,7 @@ type GoogleTravelHotelsOpts struct { Pages int Limit int Locale string - GeoLocation string + GeoLocation *string UserAgent oxylabs.UserAgent Render oxylabs.Render CallbackURL string @@ -694,7 +694,7 @@ type GoogleImagesOpts struct { StartPage int Pages int Locale string - GeoLocation string + GeoLocation *string UserAgent oxylabs.UserAgent Render oxylabs.Render CallbackURL string diff --git a/serp/google_async.go b/serp/google_async.go index df4f530..4e3c88c 100644 --- a/serp/google_async.go +++ b/serp/google_async.go @@ -58,7 +58,7 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( "source": "google_search", "domain": opt.Domain, "query": query, - "geo_location": opt.Geolocation, + "geo_location": opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -105,7 +105,7 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( "start_page": opt.StartPage, "pages": opt.Pages, "limit": opt.Limit, - "geo_location": opt.Geolocation, + "geo_location": opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, diff --git a/serp/yandex.go b/serp/yandex.go index f84cecc..32a2145 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -65,7 +65,7 @@ type YandexSearchOpts struct { Pages int Limit int Locale oxylabs.Locale - GeoLocation 
string + GeoLocation *string UserAgent oxylabs.UserAgent CallbackUrl string } From 1a9a06bcbbe462df458f14ed5c3556a7a94c7b08 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Mon, 18 Dec 2023 15:53:35 +0500 Subject: [PATCH 18/27] refactor async functions --- serp/async_helpers.go | 164 ++++++++ serp/baidu_async.go | 235 +---------- serp/bing.go | 4 +- serp/bing_async.go | 238 +---------- serp/google.go | 18 +- serp/google_async.go | 940 ++---------------------------------------- serp/yandex.go | 2 +- serp/yandex_async.go | 228 +--------- 8 files changed, 243 insertions(+), 1586 deletions(-) create mode 100644 serp/async_helpers.go diff --git a/serp/async_helpers.go b/serp/async_helpers.go new file mode 100644 index 0000000..37bae41 --- /dev/null +++ b/serp/async_helpers.go @@ -0,0 +1,164 @@ +package serp + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// Helper function to make post request and retrieve Job ID. +func (c *SerpClientAsync) GetJobID( + jsonPayload []byte, +) (string, error) { + request, _ := http.NewRequest( + "POST", + c.BaseUrl, + bytes.NewBuffer(jsonPayload), + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + return "", fmt.Errorf("error performing request: %v", err) + } + + responseBody, err := io.ReadAll(response.Body) + if err != nil { + return "", fmt.Errorf("error reading response body: %v", err) + } + response.Body.Close() + + // Unmarshal into job. 
+ job := &Job{} + if err = json.Unmarshal(responseBody, &job); err != nil { + return "", fmt.Errorf("error unmarshalling Job response body: %v", err) + } + + return job.ID, nil +} + +// Helper function to handle response parsing and error checking +func (c *SerpClientAsync) GetResponse( + jobID string, + parse bool, + responseChan chan *Response, + errChan chan error, +) { + request, _ := http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", jobID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + response.Body.Close() + + // Send back error message. + if response.StatusCode != 200 { + err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) + errChan <- err + close(responseChan) + return + } + + // Unmarshal the JSON object. + resp := &Response{} + resp.Parse = parse + if err := resp.UnmarshalJSON(responseBody); err != nil { + err = fmt.Errorf("failed to parse JSON object: %v", err) + errChan <- err + close(responseChan) + return + } + resp.StatusCode = response.StatusCode + resp.Status = response.Status + close(errChan) + responseChan <- resp +} + +// PollJobStatus polls the job status and handles the response/error channels. +func (c *SerpClientAsync) PollJobStatus(jobID string, + parse bool, + responseChan chan *Response, + errChan chan error, +) { + // Setting start time to check for timeout. + startNow := time.Now() + + for { + // Perform request to query job status. 
+ request, _ := http.NewRequest( + "GET", + fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", jobID), + nil, + ) + request.Header.Add("Content-type", "application/json") + request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) + response, err := c.HttpClient.Do(request) + if err != nil { + errChan <- err + close(responseChan) + return + } + + // Read the response body into a buffer. + responseBody, err := io.ReadAll(response.Body) + response.Body.Close() + if err != nil { + err = fmt.Errorf("error reading response body: %v", err) + errChan <- err + close(responseChan) + return + } + + // Unmarshal into job. + job := &Job{} + if err = json.Unmarshal(responseBody, &job); err != nil { + err = fmt.Errorf("error unmarshalling Job response body: %v", err) + errChan <- err + close(responseChan) + return + } + + // Check job status. + if job.Status == "done" { + c.GetResponse(job.ID, parse, responseChan, errChan) + return + } else if job.Status == "faulted" { + err = fmt.Errorf("There was an error processing your query") + errChan <- err + close(responseChan) + return + } + + // Check for timeout. 
+ if time.Since(startNow) > oxylabs.DefaultTimeout { + err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) + errChan <- err + close(responseChan) + return + } + + time.Sleep(oxylabs.DefaultWaitTime) + } +} diff --git a/serp/baidu_async.go b/serp/baidu_async.go index 1540a60..221aae6 100644 --- a/serp/baidu_async.go +++ b/serp/baidu_async.go @@ -1,12 +1,8 @@ package serp import ( - "bytes" "encoding/json" "fmt" - "io" - "net/http" - "time" "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) @@ -54,123 +50,14 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - err = json.Unmarshal(responseBody, &job) - if err != nil { - return nil, fmt.Errorf("error unmarshalling response body: %v", err) - } - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -221,120 +108,14 @@ func (c *SerpClientAsync) ScrapeBaiduUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { diff --git a/serp/bing.go b/serp/bing.go index 7c65e26..fd8eb10 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -91,7 +91,7 @@ func (c *SerpClient) ScrapeBingSearch( "pages": opt.Pages, "limit": opt.Limit, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "callback_url": opt.CallbackUrl, "render": opt.Render, @@ -147,7 +147,7 @@ func (c *SerpClient) ScrapeBingUrl( "source": "bing", "url": url, "user_agent_type": opt.UserAgent, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "render": opt.Render, "callback_url": opt.CallbackUrl, } diff --git a/serp/bing_async.go b/serp/bing_async.go index 47318d8..b2b119c 100644 --- a/serp/bing_async.go +++ b/serp/bing_async.go @@ -1,12 +1,8 @@ package serp import ( - "bytes" "encoding/json" "fmt" - "io" - "net/http" - "time" "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) @@ -47,7 +43,7 @@ func (c *SerpClientAsync) ScrapeBingSearch( "pages": opt.Pages, "limit": opt.Limit, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "callback_url": opt.CallbackUrl, "render": opt.Render, @@ 
-57,121 +53,14 @@ func (c *SerpClientAsync) ScrapeBingSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. - job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. 
- responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. - resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -215,7 +104,7 @@ func (c *SerpClientAsync) ScrapeBingUrl( "source": "bing", "url": url, "user_agent_type": opt.UserAgent, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "render": opt.Render, "callback_url": opt.CallbackUrl, } @@ -224,121 +113,14 @@ func (c *SerpClientAsync) ScrapeBingUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. 
+ jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. - job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { diff --git a/serp/google.go b/serp/google.go index 9d68ff7..2438380 100644 --- a/serp/google.go +++ b/serp/google.go @@ -219,7 +219,7 @@ func (c *SerpClient) ScrapeGoogleSearch( "source": "google_search", "domain": opt.Domain, "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -266,7 +266,7 @@ func (c *SerpClient) ScrapeGoogleSearch( "start_page": opt.StartPage, "pages": opt.Pages, "limit": opt.Limit, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -357,7 +357,7 @@ func (c *SerpClient) ScrapeGoogleUrl( "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackUrl, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "parse": opt.Parse, } jsonPayload, err := json.Marshal(payload) @@ -434,7 +434,7 @@ func (c *SerpClient) ScrapeGoogleAds( "source": "google_search", "domain": opt.Domain, "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": 
opt.Render, @@ -502,7 +502,7 @@ func (c *SerpClient) ScrapeGoogleSuggestions( payload := map[string]interface{}{ "source": "google_suggestions", "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackUrl, @@ -574,7 +574,7 @@ func (c *SerpClient) ScrapeGoogleHotels( "limit": opt.Limit, "locale": opt.Locale, "results_language": opt.ResultsLanguage, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackURL, @@ -657,7 +657,7 @@ func (c *SerpClient) ScrapeGoogleTravelHotels( "pages": opt.Pages, "limit": opt.Limit, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackURL, @@ -743,7 +743,7 @@ func (c *SerpClient) ScrapeGoogleImages( "start_page": opt.StartPage, "pages": opt.Pages, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackURL, @@ -808,7 +808,7 @@ func (c *SerpClient) ScrapeGoogleTrendsExplore( payload := map[string]interface{}{ "source": "google_trends_explore", "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "context": []map[string]interface{}{ { "key": "search_type", diff --git a/serp/google_async.go b/serp/google_async.go index 4e3c88c..732201a 100644 --- a/serp/google_async.go +++ b/serp/google_async.go @@ -1,12 +1,8 @@ package serp import ( - "bytes" "encoding/json" "fmt" - "io" - "net/http" - "time" "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) @@ -58,7 +54,7 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( "source": "google_search", "domain": opt.Domain, "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": 
opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -105,7 +101,7 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( "start_page": opt.StartPage, "pages": opt.Pages, "limit": opt.Limit, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -147,121 +143,14 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - resp.Parse = opt.Parse - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, opt.Parse, responseChan, errChan) err = <-errChan if err != nil { @@ -307,7 +196,7 @@ func (c *SerpClientAsync) ScrapeGoogleUrl( "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackUrl, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "parse": opt.Parse, } jsonPayload, err := json.Marshal(payload) @@ -315,121 +204,14 @@ func (c *SerpClientAsync) ScrapeGoogleUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - resp.Parse = opt.Parse - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, opt.Parse, responseChan, errChan) err = <-errChan if err != nil { @@ -476,7 +258,7 @@ func (c *SerpClientAsync) ScrapeGoogleAds( "source": "google_search", "domain": opt.Domain, "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "parse": opt.Parse, "render": opt.Render, @@ -504,121 +286,14 @@ func (c *SerpClientAsync) ScrapeGoogleAds( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - resp.Parse = opt.Parse - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, opt.Parse, responseChan, errChan) err = <-errChan if err != nil { @@ -655,7 +330,7 @@ func (c *SerpClientAsync) ScrapeGoogleSuggestions( payload := map[string]interface{}{ "source": "google_suggestions", "query": query, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackUrl, @@ -665,120 +340,14 @@ func (c *SerpClientAsync) ScrapeGoogleSuggestions( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -831,7 +400,7 @@ func (c *SerpClientAsync) ScrapeGoogleHotels( "limit": opt.Limit, "locale": opt.Locale, "results_language": opt.ResultsLanguage, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackURL, @@ -855,120 +424,14 @@ func (c *SerpClientAsync) ScrapeGoogleHotels( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -1019,7 +482,7 @@ func (c *SerpClientAsync) ScrapeGoogleTravelHotels( "pages": opt.Pages, "limit": opt.Limit, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackURL, @@ -1043,120 +506,14 @@ func (c *SerpClientAsync) ScrapeGoogleTravelHotels( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -1211,7 +568,7 @@ func (c *SerpClientAsync) ScrapeGoogleImages( "start_page": opt.StartPage, "pages": opt.Pages, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "render": opt.Render, "callback_url": opt.CallbackURL, @@ -1231,120 +588,14 @@ func (c *SerpClientAsync) ScrapeGoogleImages( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -1414,125 +665,14 @@ func (c *SerpClientAsync) ScrapeGoogleTrendsExplore( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - - if response.StatusCode == 400 { - return nil, fmt.Errorf("error with status code %v: %v", response.StatusCode, string(responseBody)) - } - - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { diff --git a/serp/yandex.go b/serp/yandex.go index 32a2145..8b1d238 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -103,7 +103,7 @@ func (c *SerpClient) ScrapeYandexSearch( "pages": opt.Pages, "limit": opt.Limit, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "callback_url": opt.CallbackUrl, } diff --git a/serp/yandex_async.go b/serp/yandex_async.go index 50e1409..3df3000 100644 --- a/serp/yandex_async.go +++ b/serp/yandex_async.go @@ -1,12 +1,8 @@ package serp import ( - "bytes" "encoding/json" "fmt" - "io" - "net/http" - "time" "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) @@ -47,7 +43,7 @@ func (c *SerpClientAsync) ScrapeYandexSearch( "pages": opt.Pages, "limit": opt.Limit, "locale": opt.Locale, - "geo_location": opt.GeoLocation, + "geo_location": &opt.GeoLocation, "user_agent_type": opt.UserAgent, "callback_url": opt.CallbackUrl, } @@ -56,120 +52,14 @@ func (c *SerpClientAsync) ScrapeYandexSearch( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", 
"application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. - job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, err = io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. 
- if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. - resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { @@ -221,114 +111,14 @@ func (c *SerpClientAsync) ScrapeYandexUrl( return nil, fmt.Errorf("error marshalling payload: %v", err) } - request, _ := http.NewRequest( - "POST", - c.BaseUrl, - bytes.NewBuffer(jsonPayload), - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err := c.HttpClient.Do(request) + // Get job ID. + jobID, err := c.GetJobID(jsonPayload) if err != nil { return nil, err } - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - response.Body.Close() - - // Unmarshal into job. 
- job := &Job{} - json.Unmarshal(responseBody, &job) - - go func() { - startNow := time.Now() - - for { - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", job.ID), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - responseBody, _ = io.ReadAll(response.Body) - response.Body.Close() - - json.Unmarshal(responseBody, &job) - - if job.Status == "done" { - JobId := job.ID - request, _ = http.NewRequest( - "GET", - fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId), - nil, - ) - request.Header.Add("Content-type", "application/json") - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - response, err = c.HttpClient.Do(request) - if err != nil { - errChan <- err - close(responseChan) - return - } - - // Read the response body into a buffer. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - err = fmt.Errorf("error reading response body: %v", err) - errChan <- err - close(responseChan) - return - } - response.Body.Close() - - // Send back error message. - if response.StatusCode != 200 { - err = fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - errChan <- err - close(responseChan) - return - } - - // Unmarshal the JSON object. 
- resp := &Response{} - if err := resp.UnmarshalJSON(responseBody); err != nil { - err = fmt.Errorf("failed to parse JSON object: %v", err) - errChan <- err - close(responseChan) - return - } - resp.StatusCode = response.StatusCode - resp.Status = response.Status - close(errChan) - responseChan <- resp - } else if job.Status == "faulted" { - err = fmt.Errorf("There was an error processing your query") - errChan <- err - close(responseChan) - return - } - - if time.Since(startNow) > oxylabs.DefaultTimeout { - err = fmt.Errorf("timeout exceeded: %v", oxylabs.DefaultTimeout) - errChan <- err - close(responseChan) - return - } - - time.Sleep(oxylabs.DefaultWaitTime) - } - }() + // Poll job status. + go c.PollJobStatus(jobID, false, responseChan, errChan) err = <-errChan if err != nil { From 179164d3d8dab26e2007eba1d37e35b627f6cf36 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 19 Dec 2023 18:21:30 +0500 Subject: [PATCH 19/27] update creating payload in google_search funcs --- serp/google.go | 133 +++++++++++++++---------------------------- serp/google_async.go | 133 +++++++++++++++---------------------------- 2 files changed, 90 insertions(+), 176 deletions(-) diff --git a/serp/google.go b/serp/google.go index 2438380..98f94ba 100644 --- a/serp/google.go +++ b/serp/google.go @@ -210,97 +210,54 @@ func (c *SerpClient) ScrapeGoogleSearch( return nil, err } - // Prepare payload. - var payload map[string]interface{} + // Prepare payload with common parameters. 
+ payload := map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "geo_location": &opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "filter", + "value": context["filter"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "safe_search", + "value": context["safe_search"], + }, + { + "key": "fpstate", + "value": context["fpstate"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } - // If user sends limit_per_page context parameter, use it instead of limit, start_page and pages parameters. + // If user sends limit_per_page context parameter, use it instead of limit, start_page, and pages parameters. if context["limit_per_page"] != nil { - payload = map[string]interface{}{ - "source": "google_search", - "domain": opt.Domain, - "query": query, - "geo_location": &opt.GeoLocation, - "user_agent_type": opt.UserAgent, - "parse": opt.Parse, - "render": opt.Render, - "context": []map[string]interface{}{ - { - "key": "results_language", - "value": context["results_language"], - }, - { - "key": "filter", - "value": context["filter"], - }, - { - "key": "limit_per_page", - "value": context["limit_per_page"], - }, - { - "key": "nfpr", - "value": context["nfpr"], - }, - { - "key": "safe_search", - "value": context["safe_search"], - }, - { - "key": "fpstate", - "value": context["fpstate"], - }, - { - "key": "tbm", - "value": context["tbm"], - }, - { - "key": "tbs", - "value": context["tbs"], - }, - }, - } + payload["limit_per_page"] = context["limit_per_page"] } else { - payload = map[string]interface{}{ - "source": "google_search", - "domain": opt.Domain, - "query": query, - "start_page": opt.StartPage, - "pages": opt.Pages, - "limit": opt.Limit, - "geo_location": 
&opt.GeoLocation, - "user_agent_type": opt.UserAgent, - "parse": opt.Parse, - "render": opt.Render, - "context": []map[string]interface{}{ - { - "key": "results_language", - "value": context["results_language"], - }, - { - "key": "filter", - "value": context["filter"], - }, - { - "key": "nfpr", - "value": context["nfpr"], - }, - { - "key": "safe_search", - "value": context["safe_search"], - }, - { - "key": "fpstate", - "value": context["fpstate"], - }, - { - "key": "tbm", - "value": context["tbm"], - }, - { - "key": "tbs", - "value": context["tbs"], - }, - }, - } + payload["start_page"] = opt.StartPage + payload["pages"] = opt.Pages + payload["limit"] = opt.Limit } jsonPayload, err := json.Marshal(payload) diff --git a/serp/google_async.go b/serp/google_async.go index 732201a..e7c91bd 100644 --- a/serp/google_async.go +++ b/serp/google_async.go @@ -45,97 +45,54 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( return nil, err } - // Prepare payload. - var payload map[string]interface{} + // Prepare payload with common parameters. + payload := map[string]interface{}{ + "source": "google_search", + "domain": opt.Domain, + "query": query, + "geo_location": &opt.GeoLocation, + "user_agent_type": opt.UserAgent, + "parse": opt.Parse, + "render": opt.Render, + "context": []map[string]interface{}{ + { + "key": "results_language", + "value": context["results_language"], + }, + { + "key": "filter", + "value": context["filter"], + }, + { + "key": "nfpr", + "value": context["nfpr"], + }, + { + "key": "safe_search", + "value": context["safe_search"], + }, + { + "key": "fpstate", + "value": context["fpstate"], + }, + { + "key": "tbm", + "value": context["tbm"], + }, + { + "key": "tbs", + "value": context["tbs"], + }, + }, + } - // If user sends limit_per_page context parameter, use it instead of limit, start_page and pages parameters. + // If user sends limit_per_page context parameter, use it instead of limit, start_page, and pages parameters. 
if context["limit_per_page"] != nil { - payload = map[string]interface{}{ - "source": "google_search", - "domain": opt.Domain, - "query": query, - "geo_location": &opt.GeoLocation, - "user_agent_type": opt.UserAgent, - "parse": opt.Parse, - "render": opt.Render, - "context": []map[string]interface{}{ - { - "key": "results_language", - "value": context["results_language"], - }, - { - "key": "filter", - "value": context["filter"], - }, - { - "key": "limit_per_page", - "value": context["limit_per_page"], - }, - { - "key": "nfpr", - "value": context["nfpr"], - }, - { - "key": "safe_search", - "value": context["safe_search"], - }, - { - "key": "fpstate", - "value": context["fpstate"], - }, - { - "key": "tbm", - "value": context["tbm"], - }, - { - "key": "tbs", - "value": context["tbs"], - }, - }, - } + payload["limit_per_page"] = context["limit_per_page"] } else { - payload = map[string]interface{}{ - "source": "google_search", - "domain": opt.Domain, - "query": query, - "start_page": opt.StartPage, - "pages": opt.Pages, - "limit": opt.Limit, - "geo_location": &opt.GeoLocation, - "user_agent_type": opt.UserAgent, - "parse": opt.Parse, - "render": opt.Render, - "context": []map[string]interface{}{ - { - "key": "results_language", - "value": context["results_language"], - }, - { - "key": "filter", - "value": context["filter"], - }, - { - "key": "nfpr", - "value": context["nfpr"], - }, - { - "key": "safe_search", - "value": context["safe_search"], - }, - { - "key": "fpstate", - "value": context["fpstate"], - }, - { - "key": "tbm", - "value": context["tbm"], - }, - { - "key": "tbs", - "value": context["tbs"], - }, - }, - } + payload["start_page"] = opt.StartPage + payload["pages"] = opt.Pages + payload["limit"] = opt.Limit } jsonPayload, err := json.Marshal(payload) From ce1b8ce94073ba7e0360f16da3fafc5b5192c995 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Tue, 19 Dec 2023 19:23:43 +0500 Subject: [PATCH 20/27] update public func comments --- serp/baidu.go | 10 ++++--- 
serp/baidu_async.go | 6 ++-- serp/bing.go | 10 ++++--- serp/bing_async.go | 6 ++-- serp/google.go | 64 ++++++++++++++++++++++++------------------- serp/google_async.go | 24 ++++++++++------ serp/proxy_endpint.go | 6 ++-- serp/yandex.go | 18 ++++++------ serp/yandex_async.go | 6 ++-- 9 files changed, 90 insertions(+), 60 deletions(-) diff --git a/serp/baidu.go b/serp/baidu.go index 3bbba7d..245eb8f 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -13,7 +13,7 @@ var BaiduSearchAcceptedDomainParameters = []oxylabs.Domain{ oxylabs.DOMAIN_CN, } -// checkParameterValidity checks validity of baidu search parameters. +// checkParameterValidity checks validity of ScrapeBaiduSearch parameters. func (opt *BaiduSearchOpts) checkParameterValidity() error { if !oxylabs.InList(opt.Domain, BaiduSearchAcceptedDomainParameters) { return fmt.Errorf("invalid domain parameter: %s", opt.Domain) @@ -30,7 +30,7 @@ func (opt *BaiduSearchOpts) checkParameterValidity() error { return nil } -// checkParameterValidity checks validity of baidu url parameters. +// checkParameterValidity checks validity of ScrapeBaiduUrl parameters. func (opt *BaiduUrlOpts) checkParameterValidity() error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -39,6 +39,7 @@ func (opt *BaiduUrlOpts) checkParameterValidity() error { return nil } +// BaiduSearchOpts contains all the query paramaters available for baidu_search. type BaiduSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -48,7 +49,7 @@ type BaiduSearchOpts struct { CallbackUrl string } -// ScrapeBaiduSearch scrapes baidu with baidu_search as source. +// ScrapeBaiduSearch scrapes baidu via Oxylabs SERP API with baidu_search as source. func (c *SerpClient) ScrapeBaiduSearch( query string, opts ...*BaiduSearchOpts, @@ -96,12 +97,13 @@ func (c *SerpClient) ScrapeBaiduSearch( return res, nil } +// BaiduUrlOpts contains all the query paramaters available for baidu. 
type BaiduUrlOpts struct { UserAgent oxylabs.UserAgent CallbackUrl string } -// ScrapeBaiduUrl scrapes baidu with baidu as source. +// ScrapeBaiduUrl scrapes baidu via Oxylabs SERP API with baidu as source. func (c *SerpClient) ScrapeBaiduUrl( url string, opts ...*BaiduUrlOpts, diff --git a/serp/baidu_async.go b/serp/baidu_async.go index 221aae6..97faf46 100644 --- a/serp/baidu_async.go +++ b/serp/baidu_async.go @@ -7,7 +7,8 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// ScrapeBingSearch scrapes bing with bing_search as source with async polling runtime. +// ScrapeBingSearch scrapes bing with async polling runtime via Oxylabs SERP API +// and bing_search as source. func (c *SerpClientAsync) ScrapeBaiduSearch( query string, opts ...*BaiduSearchOpts, @@ -67,7 +68,8 @@ func (c *SerpClientAsync) ScrapeBaiduSearch( return responseChan, nil } -// ScrapeBingUrl scrapes bing with bing as source with async polling runtime. +// ScrapeBingUrl scrapes bing with async polling runtime via Oxylabs SERP API +// and bing as source. func (c *SerpClientAsync) ScrapeBaiduUrl( url string, opts ...*BaiduUrlOpts, diff --git a/serp/bing.go b/serp/bing.go index fd8eb10..bb23939 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -17,7 +17,7 @@ var BingSearchAcceptedDomainParameters = []oxylabs.Domain{ oxylabs.DOMAIN_TR, } -// checkParameterValidity checks validity of bing search parameters. +// checkParameterValidity checks validity of ScrapeBingSearch parameters. func (opt *BingSearchOpts) checkParameterValidity() error { if opt.Domain != "" && !oxylabs.InList(opt.Domain, BingSearchAcceptedDomainParameters) { return fmt.Errorf("invalid domain parameter: %s", opt.Domain) @@ -33,7 +33,7 @@ func (opt *BingSearchOpts) checkParameterValidity() error { return nil } -// checkParameterValidity checks validity of bing url parameters. +// checkParameterValidity checks validity of ScrapeBingUrl parameters. 
func (opt *BingUrlOpts) checkParameterValidity() error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -46,6 +46,7 @@ func (opt *BingUrlOpts) checkParameterValidity() error { return nil } +// BingSearchOpts contains all the query paramaters available for bing_search. type BingSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -58,7 +59,7 @@ type BingSearchOpts struct { Render oxylabs.Render } -// ScraperBingSearch scrapes bing with bing_search as source. +// ScrapeBingSearch scrapes bing via Oxylabs SERP API with bing_search as source. func (c *SerpClient) ScrapeBingSearch( query string, opts ...*BingSearchOpts, @@ -109,6 +110,7 @@ func (c *SerpClient) ScrapeBingSearch( return res, nil } +// BingUrlOpts contains all the query paramaters available for bing. type BingUrlOpts struct { UserAgent oxylabs.UserAgent GeoLocation *string @@ -116,7 +118,7 @@ type BingUrlOpts struct { CallbackUrl string } -// ScrapeBingUrl scrapes bing with bing as source. +// ScrapeBingUrl scrapes bing via Oxylabs SERP API with bing as source. func (c *SerpClient) ScrapeBingUrl( url string, opts ...*BingUrlOpts, diff --git a/serp/bing_async.go b/serp/bing_async.go index b2b119c..80b9017 100644 --- a/serp/bing_async.go +++ b/serp/bing_async.go @@ -7,7 +7,8 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// ScrapeBingSearch scrapes bing with bing_search as source with async polling runtime. +// ScrapeBingSearch scrapes bing with async polling runtime via Oxylabs SERP API +// and bing_search as source. func (c *SerpClientAsync) ScrapeBingSearch( query string, opts ...*BingSearchOpts, @@ -70,7 +71,8 @@ func (c *SerpClientAsync) ScrapeBingSearch( return responseChan, nil } -// ScrapeBingUrl scrapes bing with bing as source with async polling runtime. +// ScrapeBingUrl scrapes bing with async polling runtime via Oxylabs SERP API +// and bing as source. 
func (c *SerpClientAsync) ScrapeBingUrl( url string, opts ...*BingUrlOpts, diff --git a/serp/google.go b/serp/google.go index 98f94ba..f0960a0 100644 --- a/serp/google.go +++ b/serp/google.go @@ -27,7 +27,7 @@ var AcceptedSearchTypeParameters = []string{ "youtube_search", } -// checkParameterValidity checks validity of google search parameters. +// checkParameterValidity checks validity of ScrapeGoogleSearch parameters. func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -48,7 +48,7 @@ func (opt *GoogleSearchOpts) checkParameterValidity(ctx ContextOption) error { return nil } -// checkParameterValidity checks validity of google url parameters. +// checkParameterValidity checks validity of ScrapeGoogleUrl parameters. func (opt *GoogleUrlOpts) checkParameterValidity() error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -61,7 +61,7 @@ func (opt *GoogleUrlOpts) checkParameterValidity() error { return nil } -// checkParameterValidity checks validity of google ads parameters. +// checkParameterValidity checks validity of ScrapeGoogleAds parameters. func (opt *GoogleAdsOpts) checkParameterValidity(ctx ContextOption) error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -82,7 +82,7 @@ func (opt *GoogleAdsOpts) checkParameterValidity(ctx ContextOption) error { return nil } -// checkParameterValidity checks validity of google suggestions parameters. +// checkParameterValidity checks validity of ScrapeGoogleSuggestions parameters. 
func (opt *GoogleSuggestionsOpts) checkParameterValidity() error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -95,7 +95,7 @@ func (opt *GoogleSuggestionsOpts) checkParameterValidity() error { return nil } -// checkParameterValidity checks validity of google hotels parameters. +// checkParameterValidity checks validity of ScrapeGoogleHotels parameters. func (opt *GoogleHotelsOpts) checkParameterValidity(ctx ContextOption) error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -116,7 +116,7 @@ func (opt *GoogleHotelsOpts) checkParameterValidity(ctx ContextOption) error { return nil } -// checkParameterValidity checks validity of google travel hotels parameters. +// checkParameterValidity checks validity of ScrapeGoogleTravelHotels parameters. func (opt *GoogleTravelHotelsOpts) checkParameterValidity(ctx ContextOption) error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -145,7 +145,7 @@ func (opt *GoogleTravelHotelsOpts) checkParameterValidity(ctx ContextOption) err return nil } -// checkParameterValidity checks validity of google trends explore parameters. +// checkParameterValidity checks validity of ScrapeGoogleTrendsExplore parameters. func (opt *GoogleTrendsExploreOpts) checkParameterValidity(ctx ContextOption) error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -162,6 +162,21 @@ func (opt *GoogleTrendsExploreOpts) checkParameterValidity(ctx ContextOption) er return nil } +// checkParameterValidity checks validity of google images parameters. 
+func (opt *GoogleImagesOpts) checkParameterValidity(ctx ContextOption) error { + + if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { + return fmt.Errorf("invalid render parameter: %v", opt.Render) + } + + if opt.Pages <= 0 || opt.StartPage <= 0 { + return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") + } + + return nil +} + +// GoogleSearchOpts contains all the query parameters available for google_search. type GoogleSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -175,7 +190,7 @@ type GoogleSearchOpts struct { Context []func(ContextOption) } -// Scrapes Google via its search engine. +// ScrapeGoogleSearch scrapes google via Oxylabs SERP API with google_search as source. func (c *SerpClient) ScrapeGoogleSearch( query string, opts ...*GoogleSearchOpts, @@ -273,6 +288,7 @@ func (c *SerpClient) ScrapeGoogleSearch( return res, nil } +// GoogleUrlOpts contains all the query parameters available for google. type GoogleUrlOpts struct { GeoLocation *string UserAgent oxylabs.UserAgent @@ -281,7 +297,7 @@ type GoogleUrlOpts struct { CallbackUrl string } -// ScrapeGoogleUrl scrapes google vith google as source. +// ScrapeGoogleUrl scrapes google via Oxylabs SERP API with google as source. func (c *SerpClient) ScrapeGoogleUrl( url string, opts ...*GoogleUrlOpts, @@ -330,20 +346,7 @@ func (c *SerpClient) ScrapeGoogleUrl( return res, nil } -// checkParameterValidity checks validity of google images parameters. -func (opt *GoogleImagesOpts) checkParameterValidity(ctx ContextOption) error { - - if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { - return fmt.Errorf("invalid render parameter: %v", opt.Render) - } - - if opt.Pages <= 0 || opt.StartPage <= 0 { - return fmt.Errorf("limit, pages and start_page parameters must be greater than 0") - } - - return nil -} - +// GoogleAdsOpts contains all the query parameters available for google_ads. 
type GoogleAdsOpts struct { Domain oxylabs.Domain StartPage int @@ -357,7 +360,7 @@ type GoogleAdsOpts struct { Context []func(ContextOption) } -// SrcapeGoogleAds scrapes google via the google_ads source. +// SrcapeGoogleAds scrapes google via Oxylabs SERP API with google_ads as source. func (c *SerpClient) ScrapeGoogleAds( query string, opts ...*GoogleAdsOpts, @@ -427,6 +430,7 @@ func (c *SerpClient) ScrapeGoogleAds( return res, nil } +// GoogleShoppingOpts contains all the query parameters available for google_shopping. type GoogleSuggestionsOpts struct { Locale string GeoLocation *string @@ -435,7 +439,7 @@ type GoogleSuggestionsOpts struct { CallbackUrl string } -// ScrapeGoogleSuggestions scrapes google via the google_suggestions source. +// ScrapeGoogleSuggestions scrapes google via Oxylabs SERP API with google_suggestions as source. func (c *SerpClient) ScrapeGoogleSuggestions( query string, opts ...*GoogleSuggestionsOpts, @@ -477,6 +481,7 @@ func (c *SerpClient) ScrapeGoogleSuggestions( return res, nil } +// GoogleHotelsOpts contains all the query parameters available for google_hotels. type GoogleHotelsOpts struct { Domain oxylabs.Domain StartPage int @@ -563,6 +568,7 @@ func (c *SerpClient) ScrapeGoogleHotels( return res, nil } +// GoogleTravelHotelsOpts contains all the query parameters available for google_travel_hotels. type GoogleTravelHotelsOpts struct { Domain oxylabs.Domain StartPage int @@ -576,7 +582,7 @@ type GoogleTravelHotelsOpts struct { Context []func(ContextOption) } -// ScrapeGoogleTravelHotels scrapes google via the google_travel_hotels source. +// ScrapeGoogleTravelHotels scrapes google via Oxylabs SERP API with google_travel_hotels as source. func (c *SerpClient) ScrapeGoogleTravelHotels( query string, opts ...*GoogleTravelHotelsOpts, @@ -646,6 +652,7 @@ func (c *SerpClient) ScrapeGoogleTravelHotels( return res, nil } +// GoogleImagesOpts contains all the query parameters available for google_images. 
type GoogleImagesOpts struct { Domain oxylabs.Domain StartPage int @@ -658,7 +665,7 @@ type GoogleImagesOpts struct { Context []func(ContextOption) } -// ScrapeGoogleImages scrapes google via the google_images source. +// ScrapeGoogleImages scrapes google via Oxylabs SERP API with google_images as source. func (c *SerpClient) ScrapeGoogleImages( url string, opts ...*GoogleImagesOpts, @@ -728,6 +735,7 @@ func (c *SerpClient) ScrapeGoogleImages( return res, nil } +// GoogleTrendsExploreOpts contains all the query parameters available for google_trends_explore. type GoogleTrendsExploreOpts struct { GeoLocation *string Context []func(ContextOption) @@ -735,7 +743,7 @@ type GoogleTrendsExploreOpts struct { CallbackURL string } -// ScrapeGoogleTrendsExplore scrapes google via the google_trends_explore source. +// ScrapeGoogleTrendsExplore scrapes google via Oxylabs SERP API with google_trends_explore as source. func (c *SerpClient) ScrapeGoogleTrendsExplore( query string, opts ...*GoogleTrendsExploreOpts, diff --git a/serp/google_async.go b/serp/google_async.go index e7c91bd..ecb79c6 100644 --- a/serp/google_async.go +++ b/serp/google_async.go @@ -7,7 +7,8 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// ScrapeGoogleSearch scrapes google with google_search as source with async polling runtime. +// ScrapeGoogleSearch scrapes google with async polling runtime via Oxylabs SERP API +// and google_search as source. func (c *SerpClientAsync) ScrapeGoogleSearch( query string, opts ...*GoogleSearchOpts, @@ -117,7 +118,8 @@ func (c *SerpClientAsync) ScrapeGoogleSearch( return responseChan, nil } -// ScrapeGoogleUrl scrapes google with google as source with async polling runtime. +// ScrapeGoogleUrl scrapes google with async polling runtime via Oxylabs SERP API +// and google as source. 
func (c *SerpClientAsync) ScrapeGoogleUrl( url string, opts ...*GoogleUrlOpts, @@ -178,7 +180,8 @@ func (c *SerpClientAsync) ScrapeGoogleUrl( return responseChan, nil } -// ScrapeGoogleAds scrapes google with google_ads as source with async polling runtime. +// ScrapeGoogleAds scrapes google with async polling runtime via Oxylabs SERP API +// and google_ads as source. func (c *SerpClientAsync) ScrapeGoogleAds( query string, opts ...*GoogleAdsOpts, @@ -260,7 +263,8 @@ func (c *SerpClientAsync) ScrapeGoogleAds( return responseChan, nil } -// ScrapeGoogleSuggestions scrapes google with google_suggestions as source with async polling runtime. +// ScrapeGoogleSuggestions scrapes google with async polling runtime via Oxylabs SERP API +// and google_suggestions as source func (c *SerpClientAsync) ScrapeGoogleSuggestions( query string, opts ...*GoogleSuggestionsOpts, @@ -314,7 +318,8 @@ func (c *SerpClientAsync) ScrapeGoogleSuggestions( return responseChan, nil } -// ScrapeGoogleTravelHotels scrapes google with google_hotels as source with async polling runtime. +// ScrapeGoogleTHotels scrapes google with async polling runtime via Oxylabs SERP API +// and google_hotels as source. func (c *SerpClientAsync) ScrapeGoogleHotels( query string, opts ...*GoogleHotelsOpts, @@ -398,7 +403,8 @@ func (c *SerpClientAsync) ScrapeGoogleHotels( return responseChan, nil } -// ScrapeGoogleTravelHotels scrapes google with google_travel_hotels as source with async polling runtime. +// ScrapeGoogleTravelHotels scrapes google with async polling runtime via Oxylabs SERP API +// and google_travel_hotels as source. func (c *SerpClientAsync) ScrapeGoogleTravelHotels( query string, opts ...*GoogleTravelHotelsOpts, @@ -480,7 +486,8 @@ func (c *SerpClientAsync) ScrapeGoogleTravelHotels( return responseChan, nil } -// ScrapeGoogleImages scrapes google with google_images as source with async polling runtime. 
+// ScrapeGoogleImages scrapes google with async polling runtime via Oxylabs SERP API +// and google_images as source. func (c *SerpClientAsync) ScrapeGoogleImages( url string, opts ...*GoogleImagesOpts, @@ -562,7 +569,8 @@ func (c *SerpClientAsync) ScrapeGoogleImages( return responseChan, nil } -// ScrapeGoogleTrendsExplore scrapes google with google_trends_explore as source with async polling runtime. +// ScrapeGoogleTrendsExplore scrapes google with async polling runtime via Oxylabs SERP API +// and google_trends_explore as source. func (c *SerpClientAsync) ScrapeGoogleTrendsExplore( query string, opts ...*GoogleTrendsExploreOpts, diff --git a/serp/proxy_endpint.go b/serp/proxy_endpint.go index 010d8c4..210c2cd 100644 --- a/serp/proxy_endpint.go +++ b/serp/proxy_endpint.go @@ -9,6 +9,8 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// ScrapeProxyOpts contains all the query paramaters available when scraping +// via Oxylabs proxy endpoint. type ScrapeProxyOpts struct { UserAgent oxylabs.UserAgent GeoLocation string @@ -17,7 +19,7 @@ type ScrapeProxyOpts struct { CustomHeaders map[string]string } -// checkParameterValidity checks validity of google search parameters. +// checkParameterValidity checks validity of ScrapeProxyUrl parameters. func (opt *ScrapeProxyOpts) checkParameterValidity() error { if opt.UserAgent != "" && !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -30,7 +32,7 @@ func (opt *ScrapeProxyOpts) checkParameterValidity() error { return nil } -// ScrapeProxyUrl scrapes via proxy endpoint. +// ScrapeProxyUrl scrapes a URL via Oxylabs proxy endpoint. func (c *SerpClientProxy) ScrapeProxyUrl( url string, opts ...*ScrapeProxyOpts, diff --git a/serp/yandex.go b/serp/yandex.go index 8b1d238..b649c15 100644 --- a/serp/yandex.go +++ b/serp/yandex.go @@ -8,7 +8,7 @@ import ( ) // Accepted parameters for yandex. 
-var yandexSearchAcceptedDomainParameters = []oxylabs.Domain{ +var YandexSearchAcceptedDomainParameters = []oxylabs.Domain{ oxylabs.DOMAIN_COM, oxylabs.DOMAIN_RU, oxylabs.DOMAIN_UA, @@ -16,7 +16,7 @@ var yandexSearchAcceptedDomainParameters = []oxylabs.Domain{ oxylabs.DOMAIN_KZ, oxylabs.DOMAIN_TR, } -var yandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ +var YandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ oxylabs.LOCALE_EN, oxylabs.LOCALE_RU, oxylabs.LOCALE_BY, @@ -29,13 +29,13 @@ var yandexSearchAcceptedLocaleParameters = []oxylabs.Locale{ oxylabs.LOCALE_UK, } -// checkParameterValidity checks validity of yandex search parameters. +// checkParameterValidity checks validity of ScrapeYandexSearch parameters. func (opt *YandexSearchOpts) checkParameterValidity() error { - if !oxylabs.InList(opt.Domain, yandexSearchAcceptedDomainParameters) { + if !oxylabs.InList(opt.Domain, YandexSearchAcceptedDomainParameters) { return fmt.Errorf("invalid domain parameter: %s", opt.Domain) } - if opt.Locale != "" && !oxylabs.InList(opt.Locale, yandexSearchAcceptedLocaleParameters) { + if opt.Locale != "" && !oxylabs.InList(opt.Locale, YandexSearchAcceptedLocaleParameters) { return fmt.Errorf("invalid locale parameter: %s", opt.Locale) } @@ -46,7 +46,7 @@ func (opt *YandexSearchOpts) checkParameterValidity() error { return nil } -// checkParameterValidity checks validity of yandex url parameters. +// checkParameterValidity checks validity of ScrapeYandexUrl parameters. func (opt *YandexUrlOpts) checkParameterValidity() error { if !oxylabs.IsUserAgentValid(opt.UserAgent) { return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) @@ -59,6 +59,7 @@ func (opt *YandexUrlOpts) checkParameterValidity() error { return nil } +// YandexSearchOpts contains all the query parameters available for yandex_search. 
type YandexSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -70,7 +71,7 @@ type YandexSearchOpts struct { CallbackUrl string } -// ScrapYandexSearch scrapes yandex with yandex_search as source. +// ScrapeYandexSearch scrapes yandex via Oxylabs SERP API with yandex_search as source. func (c *SerpClient) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, @@ -120,13 +121,14 @@ func (c *SerpClient) ScrapeYandexSearch( return res, nil } +// YandexUrlOpts contains all the query parameters available for yandex. type YandexUrlOpts struct { UserAgent oxylabs.UserAgent Render oxylabs.Render CallbackUrl string } -// ScapeYandexUrl scrapes yandex with yandex as source. +// ScapeYandexUrl scrapes a yandex url via Oxylabs SERP API with yandex as source. func (c *SerpClient) ScrapeYandexUrl( url string, opts ...*YandexUrlOpts, diff --git a/serp/yandex_async.go b/serp/yandex_async.go index 3df3000..641792b 100644 --- a/serp/yandex_async.go +++ b/serp/yandex_async.go @@ -7,7 +7,8 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// ScrapeYandexSearch scrapes yandex with yandex_search as source with async polling runtime. +// ScrapeYandexSearch scrapes yandex with async polling runtime via Oxylabs SERP API +// and yandex_search as source. func (c *SerpClientAsync) ScrapeYandexSearch( query string, opts ...*YandexSearchOpts, @@ -69,7 +70,8 @@ func (c *SerpClientAsync) ScrapeYandexSearch( return responseChan, nil } -// ScrapeYandexUrl scrapes yandex with yandex as source with async polling runtime. +// ScrapeYandexUrl scrapes yandex with async polling runtime via Oxylabs SERP API +// and yandex as source. 
func (c *SerpClientAsync) ScrapeYandexUrl( url string, opts ...*YandexUrlOpts, From 8fda0b671b066e56790b681d557be1130cd50fa8 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Wed, 20 Dec 2023 15:23:01 +0500 Subject: [PATCH 21/27] Apply suggestions from code review v4 --- proxy/client.go | 34 ++++++++++++ proxy/proxy_helpers.go | 26 ++++++++++ serp/async_helpers.go | 3 +- serp/client.go | 46 ----------------- serp/proxy_endpint.go | 114 ----------------------------------------- 5 files changed, 62 insertions(+), 161 deletions(-) create mode 100644 proxy/client.go create mode 100644 proxy/proxy_helpers.go delete mode 100644 serp/proxy_endpint.go diff --git a/proxy/client.go b/proxy/client.go new file mode 100644 index 0000000..ee0a0bc --- /dev/null +++ b/proxy/client.go @@ -0,0 +1,34 @@ +package proxy + +import ( + "crypto/tls" + "fmt" + "net/http" + "net/url" +) + +// Init for Proxy runtime model. +func Init( + username string, + password string, +) (*http.Client, error) { + // Prepare proxy url. + proxyUrl, err := url.Parse( + fmt.Sprintf( + "http://%s:%s@realtime.oxylabs.io:60000", + username, + password, + ), + ) + if err != nil { + return nil, fmt.Errorf("error parsing proxy url: %v", err) + } + + // Prepare custom transport. 
+ customTransport := &http.Transport{Proxy: http.ProxyURL(proxyUrl)} + customTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + + client := &http.Client{Transport: customTransport} + + return client, nil +} diff --git a/proxy/proxy_helpers.go b/proxy/proxy_helpers.go new file mode 100644 index 0000000..049fe1f --- /dev/null +++ b/proxy/proxy_helpers.go @@ -0,0 +1,26 @@ +package proxy + +import ( + "net/http" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +// Helper functions to add headers to request object.s + +func AddGeoLocationHeader(request *http.Request, geoLocation string) { + request.Header.Add("x-oxylabs-geo-location", geoLocation) +} + +func AddUserAgentHeader(request *http.Request, userAgent oxylabs.UserAgent) { + request.Header.Add("x-oxylabs-user-agent-type", string(userAgent)) +} + +func AddRenderHeader(request *http.Request, render oxylabs.Render) { + request.Header.Add("x-oxylabs-render", string(render)) +} + +func AddParseHeader(request *http.Request, parser string) { + request.Header.Add("x-oxylabs-parse", "1") + request.Header.Add("x-oxylabs-parser", parser) +} diff --git a/serp/async_helpers.go b/serp/async_helpers.go index 37bae41..5674ad9 100644 --- a/serp/async_helpers.go +++ b/serp/async_helpers.go @@ -97,7 +97,8 @@ func (c *SerpClientAsync) GetResponse( } // PollJobStatus polls the job status and handles the response/error channels. 
-func (c *SerpClientAsync) PollJobStatus(jobID string, +func (c *SerpClientAsync) PollJobStatus( + jobID string, parse bool, responseChan chan *Response, errChan chan error, diff --git a/serp/client.go b/serp/client.go index a3028c3..49fb116 100644 --- a/serp/client.go +++ b/serp/client.go @@ -1,10 +1,7 @@ package serp import ( - "crypto/tls" - "fmt" "net/http" - "net/url" ) type ApiCredentials struct { @@ -53,46 +50,3 @@ func InitAsync( BaseUrl: "https://data.oxylabs.io/v1/queries", } } - -type SerpClientProxy struct { - HttpClient *http.Client - ApiCredentials *ApiCredentials -} - -// Init for Proxy runtime model. -func InitProxy( - username string, - password string, -) *SerpClientProxy { - // Prepare API credentials. - apiCredentials := &ApiCredentials{ - Username: username, - Password: password, - } - - // Prepare proxy url. - proxyUrl, err := url.Parse( - fmt.Sprintf( - "http://%s:%s@realtime.oxylabs.io:60000", - apiCredentials.Username, - apiCredentials.Password, - ), - ) - if err != nil { - fmt.Printf("error parsing proxy url: %v", err) - } - - // Prepare custom transport. - customTransport := &http.Transport{Proxy: http.ProxyURL(proxyUrl)} - customTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} - - client := &http.Client{Transport: customTransport} - - return &SerpClientProxy{ - ApiCredentials: &ApiCredentials{ - Username: username, - Password: password, - }, - HttpClient: client, - } -} diff --git a/serp/proxy_endpint.go b/serp/proxy_endpint.go deleted file mode 100644 index 210c2cd..0000000 --- a/serp/proxy_endpint.go +++ /dev/null @@ -1,114 +0,0 @@ -package serp - -import ( - "encoding/json" - "fmt" - "io" - "net/http" - - "github.com/mslmio/oxylabs-sdk-go/oxylabs" -) - -// ScrapeProxyOpts contains all the query paramaters available when scraping -// via Oxylabs proxy endpoint. 
-type ScrapeProxyOpts struct { - UserAgent oxylabs.UserAgent - GeoLocation string - Render oxylabs.Render - Parser *string - CustomHeaders map[string]string -} - -// checkParameterValidity checks validity of ScrapeProxyUrl parameters. -func (opt *ScrapeProxyOpts) checkParameterValidity() error { - if opt.UserAgent != "" && !oxylabs.IsUserAgentValid(opt.UserAgent) { - return fmt.Errorf("invalid user agent parameter: %v", opt.UserAgent) - } - - if opt.Render != "" && !oxylabs.IsRenderValid(opt.Render) { - return fmt.Errorf("invalid render parameter: %v", opt.Render) - } - - return nil -} - -// ScrapeProxyUrl scrapes a URL via Oxylabs proxy endpoint. -func (c *SerpClientProxy) ScrapeProxyUrl( - url string, - opts ...*ScrapeProxyOpts, -) (*ResponseProxy, error) { - // Prepare options. - opt := &ScrapeProxyOpts{} - if len(opts) > 0 && opts[len(opts)-1] != nil { - opt = opts[len(opts)-1] - } - - // Check validity of parameters. - if err := opt.checkParameterValidity(); err != nil { - return nil, err - } - - // Prepare request. - request, err := http.NewRequest( - "GET", - url, - nil, - ) - if err != nil { - return nil, fmt.Errorf("error creating request: %v", err) - } - - // If options are provided, add them to the request. - if opt.UserAgent != "" { - request.Header.Add("x-oxylabs-user-agent-type", string(opt.UserAgent)) - } - if opt.GeoLocation != "" { - request.Header.Add("x-oxylabs-geo-location", opt.GeoLocation) - } - if opt.Render != "" { - request.Header.Add("x-oxylabs-render", string(opt.Render)) - } - if opt.Parser != nil { - request.Header.Add("x-oxylabs-parse", "1") - request.Header.Add("x-oxylabs-parser", *opt.Parser) - } - if opt.CustomHeaders != nil { - for key, value := range opt.CustomHeaders { - request.Header.Add(key, value) - } - } - - request.SetBasicAuth(c.ApiCredentials.Username, c.ApiCredentials.Password) - - // Get response. 
- response, err := c.HttpClient.Do(request) - if err != nil { - return nil, fmt.Errorf("error making request: %v", err) - } - defer response.Body.Close() - - // Read response body. - responseBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, fmt.Errorf("error reading response body: %v", err) - } - - // Send back error message. - if response.StatusCode != 200 { - return nil, fmt.Errorf("error with status code %s: %s", response.Status, responseBody) - } - - // Prepare response. - resp := &ResponseProxy{} - if opt.Parser != nil { - json.Unmarshal(responseBody, &resp.ContentParsed) - } else { - resp.Content = string(responseBody) - } - - // Set status code and status. - resp.StatusCode = response.StatusCode - resp.Status = response.Status - - return resp, nil -} From 608084f42b68a2203d8352ff50ec47fbb4398dae Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Wed, 20 Dec 2023 16:12:01 +0500 Subject: [PATCH 22/27] update readme --- README.md | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 278 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8238be7..2263986 100644 --- a/README.md +++ b/README.md @@ -1 +1,278 @@ -# oxylabs-sdk-go +# Oxylabs SDK Go + +Welcome to the official SERP API SDK for [Oxylabs](https://oxylabs.io). + +The Oxylabs SERP SDK simplifies interaction with the Oxylabs SERP API, providing a seamless integration for developers to retrieve search engine results pages (SERP) data with ease. 
+ +- [Features](#features) +- [Getting Started](#getting-started) + - [Requirements](#requirements) + - [Setting Up](#setting-up) + - [Quick Start](#quick-start) +- [General Information](#general-information) + - [Integration Methods](#integration-methods) + - [Sources](#sources) + - [Query Parameters](#query-parameters) + - [Configurable Options](#configurable-options) + - [Context Options for Google Sources](#context-options-for-google-sources) +- [Integration Methods](#integration-methods-1) + - [Realtime Integration](#realtime-integration) + - [Push-Pull (Polling) Integration](#push-pull) + - [Proxy Endpoint](#proxy-endpoint) + +## Features + +- **Simplified Interface:** Abstracts away complexities, offering a straightforward user interface for interacting with the Oxylabs SERP API. + +- **Automated Request Management**: Streamlines the handling of API requests and responses for enhanced efficiency and reliability. + +- **Error Handling:** Provides meaningful error messages and handles common API errors, simplifying troubleshooting. + +- **Result Parsing:** Streamlines the process of extracting relevant data from SERP results, allowing developers to focus on application logic. + +## Getting Started +You will need an Oxylabs API username and password which you can get by signing up at https://oxylabs.io. You can check things out with a free trial at https://oxylabs.io/products/scraper-api/serp for a week. + + +### Requirements +```bash +go 1.21.0 or above +``` + +### Setting Up + +Start a local Go project if you don't have one: + +```bash +go mod init +``` + +Install the package: + +```bash +go get github.com/mslmio/oxylabs-sdk-go +``` + +### Quick Start +Basic usage of the SDK. + +```go +package main + +import ( + "fmt" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) + +func main() { + // Set your Oxylabs API Credentials. + const username = "username" + const password = "password" + + // Initialize the SERP realtime client with your credentials. 
+ c := serp.Init(username, password) + + // Use `google_search` as a source to scrape Google with adidas as a query. + res, err := c.ScrapeGoogleSearch( + "adidas", + ) + if err != nil { + panic(err) + } + + fmt.Printf("Results: %+v\n", res) +} +``` + +## General Information + +### Integration Methods +There are three integration method for the Oxylabs SERP API. + + - Realtime (Sync) + - Push-Pull (Async) + - Proxy Endpoint + +To use either them you can just use the following init functions respectively: + +- `serp.Init(username,password)` + +- `serp.InitAsync(username,password)` + +- `proxy.Init(username,password)` + +Learn more about integration methods [on the official documentation](https://developers.oxylabs.io/scraper-apis/getting-started/integration-methods) and how this SDk uses them [here](#integration-methods-1). + +### Sources +The Oxylabs SERP API scrapes according to the source provided to the API. There are currently four search engines you can scrape with the Oxylabs SERP API all with different sources. + + +| Search Engine | Sources +| ------------- | -------------- +| **Google** | `google`, `google_search`, `google_ads`, `google_hotels`, `google_travel_hotels`, `google_images`, `google_suggest`, `google_trends_explore` +| **Yandex** | `yandex`, `yandex_search` +| **Bing** | `bing`, `bing_search` +| **Baidu** | `baidu`, `baidu_search` + + +Our SDK makes it easy for you, you just need to call the relevant function name from the client. For example if you wish to scrape Yandex with `yandex_search` as a source you +just need to invoke: + +```go + res, err :=c.ScrapeYandexSearch( + "football", + ) +``` + +### Query Parameters +Each source has different accepted query parameters. For a detailed list of accepted parameters by each source you can head over to https://developers.oxylabs.io/scraper-apis/serp-scraper-api. 
+ +This SDK provides you with the option to query with default parameters by not sending anything as the second argument as seen in the above example. Lets say we want to send in some query parameters it is as simple as: + +```go + res, err := c.ScrapeYandexSearch( + "football", + &serp.YandexSearchOpts{ + StartPage: 1, + Pages: 3, + Limit: 4, + Domain: "com", + Locale: "en", + }, + ) +``` + +### Configurable Options +For consistency and ease of use, this SDK provides a list of pre-defined commonly used parameter values as constants in our library. + +Currently these are available for the `Render` and`UserAgent` parameters. For the full list you can check `oxylabs/types.go`. You can send in these values as strings too. + +These can be used like this: + +```go + res, err := c.ScrapeGoogleSearch( + "adidas", + &serp.GoogleSearchOpts{ + UserAgent: oxylabs.UA_DESKTOP_CHROME, //desktop_chrome + Render: oxylabs.HTML, // html + Domain: oxylabs.DOMAIN_COM, // com + } + ) +``` + +### Context Options for Google sources + +The SDK easily allows you to send in context options relevant to google sources. + +Here is an example of how you could send context options for Google Search: + +```go + res, err := c.ScrapeGoogleSearch( + "adidas", + &serp.GoogleSearchOpts{ + Parse: true, + Context: []func(serp.ContextOption){ + serp.ResultsLanguage("en"), + serp.Filter(1), + serp.Tbm("isch"), + serp.LimitPerPage([]serp.PageLimit{{Page: 1, Limit: 1}, { Page: 2, Limit: 6}}) + } + } + ) +``` +## Integration Methods + +### Realtime Integration +Realtime is a synchronous integration method. This means that upon sending your job submission request, **you will have to keep the connection open** until we successfully finish your job or return an error. + + +The **TTL** of Realtime connections is **150 seconds**. 
There may be rare cases where your connection times out before you receive a response from us, for example, if our system is under heavier-than-usual load or the job you submitted was extremely hard to complete: + + +### Push Pull(Polling) Integration +Push-Pull is an asynchronous integration method. This SDK implements this integration with a polling technique to poll the endpoint for results after a set interval of time. + +Using it as straightforward as using the realtime integration. The only difference is that it will return a channel with the Response. Below is an example of this integration method: + + +```go +package main + +import ( + "fmt" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" + "github.com/mslmio/oxylabs-sdk-go/serp" +) + +func main() { + const username = "username" + const password = "password" + + // Initialize the SERP push-pull client with your credentials. + c := serp.InitAsync(username, password) + + ch, err := c.ScrapeGoogleAds( + "adidas shoes", + &serp.GoogleAdsOpts{ + UserAgent: oxylabs.UA_DESKTOP, + Parse: true, + }, + ) + if err != nil { + panic(err) + } + + res := <-ch + fmt.Printf("Results: %+v\n", res) +} +``` + +### Proxy Endpoint +This method is also synchronous (like Realtime), but instead of using our service via a RESTful interface, you **can use our endpoint like a proxy**. Use Proxy Endpoint if you've used proxies before and would just like to get unblocked content from us. + +Since the parameters in this method are sent as as headers there are only a few parameters which this integration method accepts. You can find those parameters at +https://developers.oxylabs.io/scraper-apis/getting-started/integration-methods/proxy-endpoint#accepted-parameters. + +The proxy endpoint integration is very open ended allowing many different use cases. 
To cater this, the user is provided a pre-configured `http.Client` and they can use it as they deem fit: + +```go +package main + +import ( + "fmt" + "io" + "net/http" + + "github.com/mslmio/oxylabs-sdk-go/oxylabs" + "github.com/mslmio/oxylabs-sdk-go/proxy" +) + +func main() { + const username = "username" + const password = "password" + + // Init returns an http client pre configured with the proxy settings. + c, _ := proxy.Init(username, password) + + request , _ := http.NewRequest( + "GET", + "https://www.example.com", + nil, + ) + + // Add relevant Headers. + proxy.AddGeoLocationHeader(request, "Germany") + proxy.AddUserAgentHeader(request, oxylabs.UA_DESKTOP) + proxy.AddRenderHeader(request, "html") + proxy.AddParseHeader(request, "google_search") + + + request.SetBasicAuth(username, Password) + response, _ := c.Do(request) + + resp, _ := io.ReadAll(response.Body) + fmt.Println(string(resp)) +} +``` From c689f011b5f3505a2c33189a400188ee69dcc923 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Wed, 20 Dec 2023 16:16:21 +0500 Subject: [PATCH 23/27] comment --- proxy/proxy_helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/proxy_helpers.go b/proxy/proxy_helpers.go index 049fe1f..2bf71cf 100644 --- a/proxy/proxy_helpers.go +++ b/proxy/proxy_helpers.go @@ -6,7 +6,7 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// Helper functions to add headers to request object.s +// Helper functions to add headers to request object. 
func AddGeoLocationHeader(request *http.Request, geoLocation string) { request.Header.Add("x-oxylabs-geo-location", geoLocation) From 9270fd965de346fdb700f68ef8afb2c3f01a4e4b Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Wed, 20 Dec 2023 20:29:35 +0500 Subject: [PATCH 24/27] comments + spelling fixes --- proxy/client.go | 2 +- proxy/proxy_helpers.go | 9 +++------ serp/async_helpers.go | 2 +- serp/baidu.go | 4 ++-- serp/bing.go | 4 ++-- serp/context.go | 15 +++++++++++++++ serp/defaults.go | 5 +++++ 7 files changed, 29 insertions(+), 12 deletions(-) diff --git a/proxy/client.go b/proxy/client.go index ee0a0bc..722502f 100644 --- a/proxy/client.go +++ b/proxy/client.go @@ -7,7 +7,7 @@ import ( "net/url" ) -// Init for Proxy runtime model. +// Init initializes and returns an http client configured with oxylabs proxy settings. func Init( username string, password string, diff --git a/proxy/proxy_helpers.go b/proxy/proxy_helpers.go index 2bf71cf..6238e44 100644 --- a/proxy/proxy_helpers.go +++ b/proxy/proxy_helpers.go @@ -6,20 +6,17 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) -// Helper functions to add headers to request object. - -func AddGeoLocationHeader(request *http.Request, geoLocation string) { - request.Header.Add("x-oxylabs-geo-location", geoLocation) -} - +// AddUserAgentHeader adds the user_agent_type header to the request. func AddUserAgentHeader(request *http.Request, userAgent oxylabs.UserAgent) { request.Header.Add("x-oxylabs-user-agent-type", string(userAgent)) } +// AddRenderHeader adds the render header to the request. func AddRenderHeader(request *http.Request, render oxylabs.Render) { request.Header.Add("x-oxylabs-render", string(render)) } +// AddParseHeader adds the parse and parser headers to the request. 
func AddParseHeader(request *http.Request, parser string) { request.Header.Add("x-oxylabs-parse", "1") request.Header.Add("x-oxylabs-parser", parser) diff --git a/serp/async_helpers.go b/serp/async_helpers.go index 5674ad9..89813e3 100644 --- a/serp/async_helpers.go +++ b/serp/async_helpers.go @@ -42,7 +42,7 @@ func (c *SerpClientAsync) GetJobID( return job.ID, nil } -// Helper function to handle response parsing and error checking +// Helper function to handle response parsing and error checking. func (c *SerpClientAsync) GetResponse( jobID string, parse bool, diff --git a/serp/baidu.go b/serp/baidu.go index 245eb8f..a3e6452 100644 --- a/serp/baidu.go +++ b/serp/baidu.go @@ -39,7 +39,7 @@ func (opt *BaiduUrlOpts) checkParameterValidity() error { return nil } -// BaiduSearchOpts contains all the query paramaters available for baidu_search. +// BaiduSearchOpts contains all the query parameters available for baidu_search. type BaiduSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -97,7 +97,7 @@ func (c *SerpClient) ScrapeBaiduSearch( return res, nil } -// BaiduUrlOpts contains all the query paramaters available for baidu. +// BaiduUrlOpts contains all the query parameters available for baidu. type BaiduUrlOpts struct { UserAgent oxylabs.UserAgent CallbackUrl string diff --git a/serp/bing.go b/serp/bing.go index bb23939..2a90964 100644 --- a/serp/bing.go +++ b/serp/bing.go @@ -46,7 +46,7 @@ func (opt *BingUrlOpts) checkParameterValidity() error { return nil } -// BingSearchOpts contains all the query paramaters available for bing_search. +// BingSearchOpts contains all the query parameters available for bing_search. type BingSearchOpts struct { Domain oxylabs.Domain StartPage int @@ -110,7 +110,7 @@ func (c *SerpClient) ScrapeBingSearch( return res, nil } -// BingUrlOpts contains all the query paramaters available for bing. +// BingUrlOpts contains all the query parameters available for bing. 
type BingUrlOpts struct { UserAgent oxylabs.UserAgent GeoLocation *string diff --git a/serp/context.go b/serp/context.go index 5df932e..e624091 100644 --- a/serp/context.go +++ b/serp/context.go @@ -7,90 +7,105 @@ type PageLimit struct { Limit int `json:"limit"` } +// LimitPerPage sets the limits_per_page context option. func LimitPerPage(limits []PageLimit) func(ContextOption) { return func(ctx ContextOption) { ctx["limit_per_page"] = limits } } +// ResultsLanguage sets the results_language context option. func ResultsLanguage(lang string) func(ContextOption) { return func(ctx ContextOption) { ctx["results_language"] = lang } } +// Filter sets the filter context option. func Filter(filter int) func(ContextOption) { return func(ctx ContextOption) { ctx["filter"] = filter } } +// Nfpr sets the nfpr context option. func Nfpr(nfpr bool) func(ContextOption) { return func(ctx ContextOption) { ctx["nfpr"] = nfpr } } +// SafeSearch sets the safe_search context option. func SafeSearch(safeSearch bool) func(ContextOption) { return func(ctx ContextOption) { ctx["safe_search"] = safeSearch } } +// Fpstate sets the fpstate context option. func Fpstate(fpstate string) func(ContextOption) { return func(ctx ContextOption) { ctx["fpstate"] = fpstate } } +// Tbm sets the tbm context option. func Tbm(tbm string) func(ContextOption) { return func(ctx ContextOption) { ctx["tbm"] = tbm } } +// Tbs sets the tbs context option. func Tbs(tbs string) func(ContextOption) { return func(ctx ContextOption) { ctx["tbs"] = tbs } } +// HotelOccupancy sets the hotel_occupancy context option. func HotelOccupancy(num int) func(ContextOption) { return func(ctx ContextOption) { ctx["hotel_occupancy"] = num } } +// HotelDates sets the hotel_dates context option. func HotelDates(dates string) func(ContextOption) { return func(ctx ContextOption) { ctx["hotel_dates"] = dates } } +// HotelClasses sets the hotel_classes context option. 
func HotelClasses(classes []int) func(ContextOption) { return func(ctx ContextOption) { ctx["hotel_classes"] = classes } } +// SearchType sets the search_type context option. func SearchType(searchType string) func(ContextOption) { return func(ctx ContextOption) { ctx["search_type"] = searchType } } +// DateFrom sets the date_from context option. func DateFrom(dateFrom string) func(ContextOption) { return func(ctx ContextOption) { ctx["date_from"] = dateFrom } } +// DateTo sets the date_to context option. func DateTo(dateTo string) func(ContextOption) { return func(ctx ContextOption) { ctx["date_to"] = dateTo } } +// CategoryId sets the category_id context option. func CategoryId(categoryId int) func(ContextOption) { return func(ctx ContextOption) { ctx["category_id"] = categoryId diff --git a/serp/defaults.go b/serp/defaults.go index aa48021..87cbc44 100644 --- a/serp/defaults.go +++ b/serp/defaults.go @@ -4,30 +4,35 @@ import ( "github.com/mslmio/oxylabs-sdk-go/oxylabs" ) +// SetDefaultDomain sets the domain parameter if it is not set. func SetDefaultDomain(domain *oxylabs.Domain) { if *domain == "" { *domain = oxylabs.DOMAIN_COM } } +// SetDefaultStartPage sets the start_page parameter if it is not set. func SetDefaultStartPage(startPage *int) { if *startPage == 0 { *startPage = 1 } } +// SetDefaultPages sets the pages parameter if it is not set. func SetDefaultPages(pages *int) { if *pages == 0 { *pages = 1 } } +// SetDefaultLimit sets the limit parameter if it is not set. func SetDefaultLimit(limit *int) { if *limit == 0 { *limit = 10 } } +// SetDefaultUserAgent sets the user_agent parameter if it is not set. 
func SetDefaultUserAgent(userAgent *oxylabs.UserAgent) { if *userAgent == "" { *userAgent = oxylabs.UA_DESKTOP From 8a4b9c88d9da72a64eb253acd2f17eeebd8e34c2 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Wed, 20 Dec 2023 20:34:54 +0500 Subject: [PATCH 25/27] update readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 2263986..934b286 100644 --- a/README.md +++ b/README.md @@ -263,7 +263,6 @@ func main() { ) // Add relevant Headers. - proxy.AddGeoLocationHeader(request, "Germany") proxy.AddUserAgentHeader(request, oxylabs.UA_DESKTOP) proxy.AddRenderHeader(request, "html") proxy.AddParseHeader(request, "google_search") From fe7dda5a742f6b09f2e2687d671162a45f8479b4 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Thu, 21 Dec 2023 14:35:38 +0500 Subject: [PATCH 26/27] update readme --- README.md | 177 +++++++++++++++++++++++++++++------------------------- 1 file changed, 95 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 934b286..40952e2 100644 --- a/README.md +++ b/README.md @@ -1,59 +1,59 @@ # Oxylabs SDK Go -Welcome to the official SERP API SDK for [Oxylabs](https://oxylabs.io). +This is a Go SDK for the [Oxylabs](https://oxylabs.io) [Scraper APIs](https://developers.oxylabs.io/scraper-apis/getting-started). -The Oxylabs SERP SDK simplifies interaction with the Oxylabs SERP API, providing a seamless integration for developers to retrieve search engine results pages (SERP) data with ease. +This will help simplify integrating with Oxylabs's APIs, which can help you with retrieving search engine results (SERP), eCommerce data, real estate data, and more. + +Some technical features include but are not limited to: + +### Simplified Interface + +Abstracts away complexities, offering a straightforward user interface for interacting with the Oxylabs SERP API. 
-- [Features](#features) -- [Getting Started](#getting-started) - - [Requirements](#requirements) - - [Setting Up](#setting-up) - - [Quick Start](#quick-start) -- [General Information](#general-information) - - [Integration Methods](#integration-methods) - - [Sources](#sources) - - [Query Parameters](#query-parameters) - - [Configurable Options](#configurable-options) - - [Context Options for Google Sources](#context-options-for-google-sources) -- [Integration Methods](#integration-methods-1) - - [Realtime Integration](#realtime-integration) - - [Push-Pull (Polling) Integration](#push-pull) - - [Proxy Endpoint](#proxy-endpoint) +### Automated Request Management -## Features +Streamlines the handling of API requests and responses for enhanced efficiency and reliability. -- **Simplified Interface:** Abstracts away complexities, offering a straightforward user interface for interacting with the Oxylabs SERP API. +### Error Handling -- **Automated Request Management**: Streamlines the handling of API requests and responses for enhanced efficiency and reliability. +Provides meaningful error messages and handles common API errors, simplifying troubleshooting. -- **Error Handling:** Provides meaningful error messages and handles common API errors, simplifying troubleshooting. +### Result Parsing -- **Result Parsing:** Streamlines the process of extracting relevant data from SERP results, allowing developers to focus on application logic. +Streamlines the process of extracting relevant data from SERP results, allowing developers to focus on application logic. ## Getting Started + You will need an Oxylabs API username and password which you can get by signing up at https://oxylabs.io. You can check things out with a free trial at https://oxylabs.io/products/scraper-api/serp for a week. +### Setting Up -### Requirements -```bash -go 1.21.0 or above -``` +This SDK requires a minimum version of `go 1.21`. 
-### Setting Up +You can check your go version by running the following command in your preferred terminal: + +```sh +go version +``` + +If you need to install or update go you can do so by following the steps mentioned [here](https://go.dev/doc/install). -Start a local Go project if you don't have one: +#### Initialize Project -```bash -go mod init +```sh +$ mkdir ~/oxylabs-sdk +$ cd ~/oxylabs-sdk +$ go mod init oxylabs-sdk ``` -Install the package: +#### Install SDK package -```bash -go get github.com/mslmio/oxylabs-sdk-go +```sh +$ go get github.com/mslmio/oxylabs-sdk-go ``` ### Quick Start + Basic usage of the SDK. ```go @@ -62,26 +62,26 @@ package main import ( "fmt" - "github.com/mslmio/oxylabs-sdk-go/oxylabs" + "github.com/mslmio/oxylabs-sdk-go/serp" ) func main() { - // Set your Oxylabs API Credentials. + // Set your Oxylabs API Credentials. const username = "username" const password = "password" - // Initialize the SERP realtime client with your credentials. + // Initialize the SERP realtime client with your credentials. c := serp.Init(username, password) - // Use `google_search` as a source to scrape Google with adidas as a query. + // Use `google_search` as a source to scrape Google with adidas as a query. res, err := c.ScrapeGoogleSearch( "adidas", ) if err != nil { panic(err) - } + } - fmt.Printf("Results: %+v\n", res) + fmt.Printf("Results: %+v\n", res) } ``` @@ -120,9 +120,9 @@ Our SDK makes it easy for you, you just need to call the relevant function name just need to invoke: ```go - res, err :=c.ScrapeYandexSearch( - "football", - ) +res, err := c.ScrapeYandexSearch( + "football", +) ``` ### Query Parameters @@ -131,34 +131,40 @@ Each source has different accepted query parameters. For a detailed list of acce This SDK provides you with the option to query with default parameters by not sending anything as the second argument as seen in the above example. 
Lets say we want to send in some query parameters it is as simple as: ```go - res, err := c.ScrapeYandexSearch( - "football", - &serp.YandexSearchOpts{ - StartPage: 1, - Pages: 3, - Limit: 4, - Domain: "com", - Locale: "en", - }, - ) +res, err := c.ScrapeYandexSearch( + "football", + &serp.YandexSearchOpts{ + StartPage: 1, + Pages: 3, + Limit: 4, + Domain: "com", + Locale: "en", + }, +) ``` ### Configurable Options -For consistency and ease of use, this SDK provides a list of pre-defined commonly used parameter values as constants in our library. +For consistency and ease of use, this SDK provides a list of pre-defined commonly used parameter values as constants in our library. You can use them by importing the oxylabs package. + +```go +import ( + "github.com/mslmio/oxylabs-sdk-go/oxylabs" +) +``` Currently these are available for the `Render` and`UserAgent` parameters. For the full list you can check `oxylabs/types.go`. You can send in these values as strings too. These can be used like this: ```go - res, err := c.ScrapeGoogleSearch( - "adidas", - &serp.GoogleSearchOpts{ - UserAgent: oxylabs.UA_DESKTOP_CHROME, //desktop_chrome - Render: oxylabs.HTML, // html - Domain: oxylabs.DOMAIN_COM, // com - } - ) +res, err := c.ScrapeGoogleSearch( + "adidas", + &serp.GoogleSearchOpts{ + UserAgent: oxylabs.UA_DESKTOP_CHROME, //desktop_chrome + Render: oxylabs.HTML, // html + Domain: oxylabs.DOMAIN_COM, // com + }, +) ``` ### Context Options for Google sources @@ -168,22 +174,23 @@ The SDK easily allows you to send in context options relevant to google sources. 
Here is an example of how you could send context options for Google Search: ```go - res, err := c.ScrapeGoogleSearch( - "adidas", - &serp.GoogleSearchOpts{ - Parse: true, - Context: []func(serp.ContextOption){ - serp.ResultsLanguage("en"), - serp.Filter(1), - serp.Tbm("isch"), - serp.LimitPerPage([]serp.PageLimit{{Page: 1, Limit: 1}, { Page: 2, Limit: 6}}) - } - } - ) +res, err := c.ScrapeGoogleSearch( + "adidas", + &serp.GoogleSearchOpts{ + Parse: true, + Context: []func(serp.ContextOption){ + serp.ResultsLanguage("en"), + serp.Filter(1), + serp.Tbm("isch"), + serp.LimitPerPage([]serp.PageLimit{{Page: 1, Limit: 1}, {Page: 2, Limit: 6}}), + }, + }, +) ``` ## Integration Methods ### Realtime Integration + Realtime is a synchronous integration method. This means that upon sending your job submission request, **you will have to keep the connection open** until we successfully finish your job or return an error. @@ -191,6 +198,7 @@ The **TTL** of Realtime connections is **150 seconds**. There may be rare cases ### Push Pull(Polling) Integration + Push-Pull is an asynchronous integration method. This SDK implements this integration with a polling technique to poll the endpoint for results after a set interval of time. Using it as straightforward as using the realtime integration. The only difference is that it will return a channel with the Response. Below is an example of this integration method: @@ -210,7 +218,7 @@ func main() { const username = "username" const password = "password" - // Initialize the SERP push-pull client with your credentials. + // Initialize the SERP push-pull client with your credentials. 
c := serp.InitAsync(username, password) ch, err := c.ScrapeGoogleAds( @@ -223,13 +231,14 @@ func main() { if err != nil { panic(err) } - - res := <-ch - fmt.Printf("Results: %+v\n", res) + + res := <-ch + fmt.Printf("Results: %+v\n", res) } ``` ### Proxy Endpoint + This method is also synchronous (like Realtime), but instead of using our service via a RESTful interface, you **can use our endpoint like a proxy**. Use Proxy Endpoint if you've used proxies before and would just like to get unblocked content from us. Since the parameters in this method are sent as as headers there are only a few parameters which this integration method accepts. You can find those parameters at @@ -256,7 +265,7 @@ func main() { // Init returns an http client pre configured with the proxy settings. c, _ := proxy.Init(username, password) - request , _ := http.NewRequest( + request, _ := http.NewRequest( "GET", "https://www.example.com", nil, @@ -264,14 +273,18 @@ func main() { // Add relevant Headers. proxy.AddUserAgentHeader(request, oxylabs.UA_DESKTOP) - proxy.AddRenderHeader(request, "html") + proxy.AddRenderHeader(request, oxylabs.HTML) proxy.AddParseHeader(request, "google_search") - - request.SetBasicAuth(username, Password) + request.SetBasicAuth(username, password) response, _ := c.Do(request) - resp, _ := io.ReadAll(response.Body) + resp, _ := io.ReadAll(response.Body) fmt.Println(string(resp)) } ``` + +## About Oxylabs +Established in 2015, Oxylabs are a market-leading web intelligence collection platform, driven by the highest business, ethics, and compliance standards, enabling companies worldwide to unlock data-driven insights. 
+ +[![image](https://oxylabs.io/images/og-image.png)](https://oxylabs.io/) \ No newline at end of file From c250f364dd18602a676f89c65cbfeda43c5129f3 Mon Sep 17 00:00:00 2001 From: Maaz Munir Date: Thu, 21 Dec 2023 16:04:49 +0500 Subject: [PATCH 27/27] fmt --- README.md | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 40952e2..3694def 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This is a Go SDK for the [Oxylabs](https://oxylabs.io) [Scraper APIs](https://developers.oxylabs.io/scraper-apis/getting-started). This will help simplify integrating with Oxylabs's APIs, which can help you with retrieving search engine results (SERP), eCommerce data, real estate data, and more. - + Some technical features include but are not limited to: ### Simplified Interface @@ -24,11 +24,11 @@ Streamlines the process of extracting relevant data from SERP results, allowing ## Getting Started -You will need an Oxylabs API username and password which you can get by signing up at https://oxylabs.io. You can check things out with a free trial at https://oxylabs.io/products/scraper-api/serp for a week. +You will need an Oxylabs API username and password which you can get by signing up at https://oxylabs.io. You can check things out with a free trial at https://oxylabs.io/products/scraper-api/serp for a week. ### Setting Up -This SDK requires a minimum version of `go 1.21`. +This SDK requires a minimum version of `go 1.21`. You can check your go version by running the following command in your preferred terminal: @@ -88,11 +88,11 @@ func main() { ## General Information ### Integration Methods + There are three integration method for the Oxylabs SERP API. 
- - Realtime (Sync) - Push-Pull (Async) - - Proxy Endpoint + - Proxy Endpoint To use either them you can just use the following init functions respectively: @@ -105,18 +105,16 @@ To use either them you can just use the following init functions respectively: Learn more about integration methods [on the official documentation](https://developers.oxylabs.io/scraper-apis/getting-started/integration-methods) and how this SDk uses them [here](#integration-methods-1). ### Sources -The Oxylabs SERP API scrapes according to the source provided to the API. There are currently four search engines you can scrape with the Oxylabs SERP API all with different sources. - -| Search Engine | Sources -| ------------- | -------------- +The Oxylabs SERP API scrapes according to the source provided to the API. There are currently four search engines you can scrape with the Oxylabs SERP API all with different sources. +| Search Engine | Sources +| ------------- | -------------- | **Google** | `google`, `google_search`, `google_ads`, `google_hotels`, `google_travel_hotels`, `google_images`, `google_suggest`, `google_trends_explore` | **Yandex** | `yandex`, `yandex_search` -| **Bing** | `bing`, `bing_search` +| **Bing** | `bing`, `bing_search` | **Baidu** | `baidu`, `baidu_search` - -Our SDK makes it easy for you, you just need to call the relevant function name from the client. For example if you wish to scrape Yandex with `yandex_search` as a source you +Our SDK makes it easy for you, you just need to call the relevant function name from the client. For example if you wish to scrape Yandex with `yandex_search` as a source you just need to invoke: ```go @@ -126,6 +124,7 @@ res, err := c.ScrapeYandexSearch( ``` ### Query Parameters + Each source has different accepted query parameters. For a detailed list of accepted parameters by each source you can head over to https://developers.oxylabs.io/scraper-apis/serp-scraper-api. 
This SDK provides you with the option to query with default parameters by not sending anything as the second argument as seen in the above example. Lets say we want to send in some query parameters it is as simple as: @@ -144,6 +143,7 @@ res, err := c.ScrapeYandexSearch( ``` ### Configurable Options + For consistency and ease of use, this SDK provides a list of pre-defined commonly used parameter values as constants in our library. You can use them by importing the oxylabs package. ```go @@ -154,7 +154,7 @@ import ( Currently these are available for the `Render` and`UserAgent` parameters. For the full list you can check `oxylabs/types.go`. You can send in these values as strings too. -These can be used like this: +These can be used like this: ```go res, err := c.ScrapeGoogleSearch( @@ -187,23 +187,21 @@ res, err := c.ScrapeGoogleSearch( }, ) ``` + ## Integration Methods ### Realtime Integration Realtime is a synchronous integration method. This means that upon sending your job submission request, **you will have to keep the connection open** until we successfully finish your job or return an error. - The **TTL** of Realtime connections is **150 seconds**. There may be rare cases where your connection times out before you receive a response from us, for example, if our system is under heavier-than-usual load or the job you submitted was extremely hard to complete: - ### Push Pull(Polling) Integration Push-Pull is an asynchronous integration method. This SDK implements this integration with a polling technique to poll the endpoint for results after a set interval of time. Using it as straightforward as using the realtime integration. The only difference is that it will return a channel with the Response. Below is an example of this integration method: - ```go package main @@ -241,7 +239,7 @@ func main() { This method is also synchronous (like Realtime), but instead of using our service via a RESTful interface, you **can use our endpoint like a proxy**. 
 Use Proxy Endpoint if you've used proxies before and would just like to get unblocked content from us.
 
-Since the parameters in this method are sent as as headers there are only a few parameters which this integration method accepts. You can find those parameters at 
+Since the parameters in this method are sent as headers there are only a few parameters which this integration method accepts. You can find those parameters at
 https://developers.oxylabs.io/scraper-apis/getting-started/integration-methods/proxy-endpoint#accepted-parameters.
 
 The proxy endpoint integration is very open ended allowing many different use cases. To cater this, the user is provided a pre-configured `http.Client` and they can use it as they deem fit:
@@ -285,6 +283,7 @@ func main() {
 ```
 
 ## About Oxylabs
+
 Established in 2015, Oxylabs are a market-leading web intelligence collection platform, driven by the highest business, ethics, and compliance standards, enabling companies worldwide to unlock data-driven insights.
 
 [![image](https://oxylabs.io/images/og-image.png)](https://oxylabs.io/)
\ No newline at end of file