// SeemGen/Services/BrightDataService.cs

using BLAIzor.Interfaces;
using Microsoft.Extensions.Configuration;
using System.Net.Http;
using System.Text;
using System.Text.Json;

namespace BLAIzor.Services
{
    public class BrightDataService : IBrightDataService
    {
        private readonly ISimpleLogger _logger;
        private readonly IHttpClientFactory _httpClientFactory;
        private readonly IConfiguration _configuration;
        private string _apiToken = string.Empty;

        public BrightDataService(ISimpleLogger logger, IHttpClientFactory httpClientFactory, IConfiguration configuration)
        {
            _logger = logger;
            _httpClientFactory = httpClientFactory;
            _configuration = configuration;
        }
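
        // Currently unused within this class; reads the configured scraper provider name from "ScraperSettings:Provider".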
        private string GetScraperSettings() =>
            _configuration.GetSection("ScraperSettings").GetValue<string>("Provider") ?? string.Empty;

        public string GetApiKey()
        {
            // GetSection never returns null, so check Exists() instead of comparing the section to null.
            var section = _configuration.GetSection("ScraperSettings");
            if (!section.Exists())
            {
                return string.Empty;
            }
            return section.GetValue<string>("ApiKey") ?? string.Empty;
        }
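
        /// <summary>
        /// Triggers a Bright Data dataset run for the given Facebook page, polls the progress
        /// endpoint until the snapshot is ready (up to 60 attempts, 5 seconds apart), then
        /// downloads the snapshot as JSON. Returns the raw snapshot body, or null on failure.
        /// </summary>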
        public async Task<string?> ScrapeFacebookPostsAsync(string pageUrl, int numPosts = 10)
        {
            if (string.IsNullOrWhiteSpace(pageUrl))
                return null;

            _apiToken = GetApiKey();
            try
            {
                var client = _httpClientFactory.CreateClient();
                client.DefaultRequestHeaders.Add("Authorization", $"Bearer {_apiToken}");
                var url = "https://api.brightdata.com/datasets/v3/trigger?dataset_id=gd_lkaxegm826bjpoo9m5&include_errors=true&limit_multiple_results=20";
                var payload = new[]
                {
                    new
                    {
                        url = pageUrl,
                        num_of_posts = numPosts,
                        posts_to_not_include = Array.Empty<string>(),
                        start_date = "",
                        end_date = ""
                    }
                };
                var json = JsonSerializer.Serialize(payload);
                var response = await client.PostAsync(url, new StringContent(json, Encoding.UTF8, "application/json"));
                response.EnsureSuccessStatusCode();

                // The trigger endpoint responds with e.g. {"snapshot_id":"s_mec12qv422avgbv9jl"}.
                var triggerBody = await response.Content.ReadAsStringAsync();
                if (string.IsNullOrWhiteSpace(triggerBody))
                {
                    await _logger.ErrorAsync("Failed to initiate scraping for Facebook posts.");
                    return null;
                }

                // Extract the snapshot id from the JSON response instead of splitting strings by hand.
                using var triggerDoc = JsonDocument.Parse(triggerBody);
                if (!triggerDoc.RootElement.TryGetProperty("snapshot_id", out var idElement))
                {
                    await _logger.ErrorAsync($"Unexpected trigger response for Facebook page {pageUrl}: {triggerBody}");
                    return null;
                }
                var scrapeId = idElement.GetString();
var checkUrl = $"https://api.brightdata.com/datasets/v3/progress/{scrapeId}";
var statusResponse = await client.GetAsync(checkUrl);
var responseString = await statusResponse.Content.ReadAsStringAsync();
int attempt = 0;
//make a cycle
while (responseString.Contains("status") && !responseString.Contains("ready"))
{
if (attempt >= 60)
{
await _logger.ErrorAsync($"Failed to get scraping status for Facebook page: {pageUrl} after multiple attempts.");
return null;
}
// Wait for a while before retrying
await Task.Delay(5000); // Wait for 5 seconds
statusResponse = await client.GetAsync(checkUrl);
responseString = await statusResponse.Content.ReadAsStringAsync();
attempt++;
}

                // Now fetch the snapshot.
                var snapshotUrl = $"https://api.brightdata.com/datasets/v3/snapshot/{scrapeId}?format=json";
                var snapshotResponse = await client.GetAsync(snapshotUrl);
                var snapshotString = await snapshotResponse.Content.ReadAsStringAsync();
                return snapshotString;
            }
            catch (Exception ex)
            {
                // Log through the injected logger rather than writing to the console.
                await _logger.ErrorAsync($"Error scraping Facebook page {pageUrl}: {ex.Message}");
                return null;
            }
        }
    }
}
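
// Registration sketch (an assumption about the host app, not taken from this file): the service
// needs IHttpClientFactory, ISimpleLogger, and IConfiguration resolvable from DI, e.g. in Program.cs:
//
//     builder.Services.AddHttpClient();
//     builder.Services.AddScoped<ISimpleLogger, SimpleLogger>();            // hypothetical ISimpleLogger implementation
//     builder.Services.AddScoped<IBrightDataService, BrightDataService>();
//
// plus an appsettings.json section such as:
//
//     "ScraperSettings": { "Provider": "BrightData", "ApiKey": "<your Bright Data token>" }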