From bebb50ac67726637eb6c5725c4d569017a8778a3 Mon Sep 17 00:00:00 2001
From: Bill Metangmo <25366207+billmetangmo@users.noreply.github.com>
Date: Wed, 26 Jul 2023 11:58:06 +0000
Subject: [PATCH] fix: 403 http error consulcam website (#126)

issue caused by activation of cpanel badbots rule
---
 infra/api/scan.py | 19 ++++++++++++-------
 infra/main.tf     |  2 +-
 infra/vars.tf     |  4 ----
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/infra/api/scan.py b/infra/api/scan.py
index 2760652..5453474 100644
--- a/infra/api/scan.py
+++ b/infra/api/scan.py
@@ -1,5 +1,5 @@
 import sys
-
+import shutil
 sys.path.insert(0, "./package")
 import requests
 import urllib.request
@@ -13,6 +13,9 @@ bucket_name = os.environ["BUCKET_NAME"]
 Table_Links = os.environ["LINKS_TABLE"]
 maintainer_mail = os.environ["MAINTAINER_MAIL"]
 
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"
+}
 
 
 def S3_bucket_pictures(Picture_image, bucket_name):
@@ -37,7 +40,12 @@ def dowload_image(url):
     """
     name = url.split("/")[-1]
     real_image = f"/tmp/{str(name)}" # image in jpg version ( only /tmp is writable in aws lambda)
-    urllib.request.urlretrieve(url, real_image)
+    r = requests.get(url,stream=True,headers=headers)
+    r.raw.decode_content = True
+    r.raise_for_status()
+
+    with open( real_image, 'wb') as f:
+        shutil.copyfileobj(r.raw, f)
     return real_image
 
 
@@ -48,12 +56,9 @@ def get_source_code(link):
     :param link: the link of the web page you want to scrape
     :return: the source code of the web page
     """
-    proxy_url = os.environ["PROXY_URL"]
-    proxies = {"http": proxy_url, "https": proxy_url}
-
-    r = requests.get(link,proxies=proxies, verify=False)
+    r = requests.get(link,headers=headers)
     r.raise_for_status()
-    return soup(r.text)
+    return soup(r.text,features="html.parser")
 
 
 def filter(code_source_html):
diff --git a/infra/main.tf b/infra/main.tf
index 0f62771..f15a27b 100644
--- a/infra/main.tf
+++ b/infra/main.tf
@@ -184,7 +184,7 @@ resource "aws_lambda_function" "scan" {
       API_KEY = var.API_KEY
       SENTRY_DNS = var.SENTRY_DNS
       ENV = (terraform.workspace == "mtchoun-mouh-master") ? "production" : "${terraform.workspace}"
-      PROXY_URL = var.PROXY_URL
+
     }
   }
 
diff --git a/infra/vars.tf b/infra/vars.tf
index 8e20124..d851d46 100644
--- a/infra/vars.tf
+++ b/infra/vars.tf
@@ -59,7 +59,3 @@ variable "TFC_WORKSPACE_NAME" {
   type = string
   default = ""
 }
-
-variable "PROXY_URL" {
-  type = string
-}