mirror of
https://github.com/lemeow125/DRF_Template.git
synced 2024-11-16 19:59:24 +08:00
Update webdriver utility functions and add a sample celery task for scraping Google search
This commit is contained in:
parent
0e902b1f04
commit
9e2e32fba8
11 changed files with 193 additions and 25 deletions
|
@ -111,7 +111,8 @@ INSTALLED_APPS = [
|
||||||
'payments',
|
'payments',
|
||||||
'billing',
|
'billing',
|
||||||
'emails',
|
'emails',
|
||||||
'notifications'
|
'notifications',
|
||||||
|
'search_results'
|
||||||
]
|
]
|
||||||
|
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
|
@ -393,5 +394,5 @@ CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers:DatabaseScheduler'
|
||||||
DATA_UPLOAD_MAX_NUMBER_FIELDS = 20480
|
DATA_UPLOAD_MAX_NUMBER_FIELDS = 20480
|
||||||
|
|
||||||
GRAPH_MODELS = {
|
GRAPH_MODELS = {
|
||||||
'app_labels': ['accounts', 'user_groups', 'billing', 'emails', 'payments', 'subscriptions']
|
'app_labels': ['accounts', 'user_groups', 'billing', 'emails', 'payments', 'subscriptions', 'search_results']
|
||||||
}
|
}
|
||||||
|
|
0
backend/search_results/__init__.py
Normal file
0
backend/search_results/__init__.py
Normal file
10
backend/search_results/admin.py
Normal file
10
backend/search_results/admin.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
from unfold.admin import ModelAdmin
|
||||||
|
from django.contrib import admin
|
||||||
|
from .models import SearchResult
|
||||||
|
|
||||||
|
|
||||||
|
@admin.register(SearchResult)
class SearchResultAdmin(ModelAdmin):
    """Admin configuration for SearchResult, based on unfold's ModelAdmin."""

    model = SearchResult
    # Columns shown in the changelist table
    list_display = ['id', 'title']
    # Fields matched by the changelist search box
    search_fields = ('id', 'title', 'link')
|
6
backend/search_results/apps.py
Normal file
6
backend/search_results/apps.py
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class SearchResultsConfig(AppConfig):
    """Application configuration for the search_results app."""

    name = 'search_results'
    # Primary key type used for models that do not declare their own
    default_auto_field = 'django.db.models.BigAutoField'
|
23
backend/search_results/migrations/0001_initial.py
Normal file
23
backend/search_results/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# Generated by Django 5.0.6 on 2024-09-24 07:47
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Initial migration for search_results: creates the SearchResult model."""

    initial = True

    # No other migration has to run before this one
    dependencies = []

    operations = [
        migrations.CreateModel(
            name='SearchResult',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(max_length=1000)),
                ('link', models.CharField(max_length=1000)),
                ('timestamp', models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
|
0
backend/search_results/migrations/__init__.py
Normal file
0
backend/search_results/migrations/__init__.py
Normal file
7
backend/search_results/models.py
Normal file
7
backend/search_results/models.py
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
from django.db import models
|
||||||
|
|
||||||
|
|
||||||
|
class SearchResult(models.Model):
    """A single scraped search result: its title, its link and when it was stored."""

    title = models.CharField(max_length=1000)
    link = models.CharField(max_length=1000)
    # Set once when the row is created; not editable afterwards
    timestamp = models.DateTimeField(auto_now_add=True, editable=False)

    def __str__(self):
        """Return the title so instances are readable in the admin and shell."""
        return self.title
|
16
backend/search_results/tasks.py
Normal file
16
backend/search_results/tasks.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
|
||||||
|
|
||||||
|
from celery import shared_task
|
||||||
|
from .models import SearchResult
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task(autoretry_for=(Exception,), retry_kwargs={'max_retries': 0, 'countdown': 5})
def create_search_result(title, link):
    """Store a search result unless the same title/link pair is already saved.

    Returns a short status message describing which of the two happened.
    """
    # NOTE(review): max_retries is 0, so autoretry_for should never actually
    # re-run this task and countdown looks unused - confirm this is intended.
    duplicates = SearchResult.objects.filter(title=title, link=link)
    if duplicates.exists():
        return "SearchResult entry already exists"

    SearchResult.objects.create(title=title, link=link)
    return f"Created new SearchResult entry titled: {title}"
|
|
@ -1,16 +1,43 @@
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
from webdriver.utils import setup_webdriver, selenium_action_template
|
from webdriver.utils import setup_webdriver, selenium_action_template, google_search, get_element, get_elements
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
# Sample Celery Selenium function
|
from search_results.tasks import create_search_result
|
||||||
# TODO: Modify this as needed
|
|
||||||
|
|
||||||
|
|
||||||
@shared_task(autoretry_for=(Exception,), retry_kwargs={'max_retries': 6, 'countdown': 5})
|
# Task template
|
||||||
|
@shared_task(autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 5})
|
||||||
def sample_selenium_task():
|
def sample_selenium_task():
|
||||||
driver = setup_webdriver()
|
|
||||||
|
driver = setup_webdriver(use_proxy=False, use_saved_session=False)
|
||||||
|
driver.get("Place URL here")
|
||||||
selenium_action_template(driver)
|
selenium_action_template(driver)
|
||||||
# Place any other actions here after Selenium is done executing
|
|
||||||
|
# TODO: Modify this as needed
|
||||||
|
|
||||||
# Once completed, always close the session
|
# Once completed, always close the session
|
||||||
driver.close()
|
driver.close()
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
|
# Sample task to scrape Google for search results based on a keyword
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task(autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 5})
def simple_google_search():
    """Search Google for a fixed sample term and queue every result for storage.

    Opens a fresh browser session (no proxy, no saved session), submits the
    search through google_search(), then dispatches one create_search_result
    task per result that exposes both a heading and a link.

    Raises:
        Exception: Anything raised while loading or scraping the page
            propagates, so Celery can retry the task (up to 3 times).
    """
    driver = setup_webdriver(use_proxy=False, use_saved_session=False)
    try:
        driver.get("https://google.com/")

        google_search(driver, search_term="cat blog posts")

        # Collect the direct children of the search results container;
        # some of them may not be regular results
        search_items = get_elements(
            driver, "xpath", '//*[@id="search"]/div[1]/div[1]/*')

        for item in search_items:
            # find_elements() returns an empty list instead of raising, so a
            # child without a heading or anchor is skipped rather than
            # failing (and retrying) the whole task
            headings = item.find_elements(By.TAG_NAME, 'h3')
            anchors = item.find_elements(By.TAG_NAME, 'a')
            if not headings or not anchors:
                continue

            title = headings[0].text
            link = anchors[0].get_attribute('href')
            if not link:
                # An anchor without an href has nothing worth storing
                continue

            create_search_result.apply_async(
                kwargs={"title": title, "link": link})
    finally:
        # Once completed (or failed), always close the session so retries do
        # not leak browser processes. get_elements() already closes and quits
        # the driver before raising, so a second teardown may itself fail;
        # that failure is ignored on purpose so it cannot mask the real error.
        for teardown in (driver.close, driver.quit):
            try:
                teardown()
            except Exception:
                pass
|
||||||
|
|
|
@ -3,9 +3,7 @@ Settings file to hold constants and functions
|
||||||
"""
|
"""
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
import os
|
|
||||||
from config.settings import get_secret
|
from config.settings import get_secret
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver import FirefoxOptions
|
from selenium.webdriver import FirefoxOptions
|
||||||
|
@ -16,6 +14,8 @@ from config.settings import get_secret
|
||||||
from twocaptcha import TwoCaptcha
|
from twocaptcha import TwoCaptcha
|
||||||
from whois import whois
|
from whois import whois
|
||||||
from whois.parser import PywhoisError
|
from whois.parser import PywhoisError
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
def take_snapshot(driver, filename='dump.png'):
|
def take_snapshot(driver, filename='dump.png'):
|
||||||
|
@ -135,23 +135,19 @@ def setup_webdriver(driver_type="chrome", use_proxy=True, use_saved_session=Fals
|
||||||
print(f'External IP: {ip_address}')
|
print(f'External IP: {ip_address}')
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
|
# These are wrapper functions for quickly automating multiple steps in web scraping (logins, button presses, text inputs, etc.)
|
||||||
|
# Depending on your use case, you may have to opt out of using this
|
||||||
|
|
||||||
|
|
||||||
# Function to get the element once it has loaded in
|
# Function to get the element once it has loaded in
|
||||||
|
|
||||||
|
|
||||||
def get_element(driver, by, key, hidden_element=False, timeout=8):
|
def get_element(driver, by, key, hidden_element=False, timeout=8):
|
||||||
try:
|
try:
|
||||||
if by == "xpath":
|
# Convert string-based locators to By objects (By.XPATH, By.CSS, etc.)
|
||||||
by = By.XPATH
|
if isinstance(by, str):
|
||||||
elif by == "css":
|
by = getattr(By, by.upper())
|
||||||
by = By.CSS_SELECTOR
|
|
||||||
elif by == "id":
|
|
||||||
by = By.ID
|
|
||||||
elif by == "tagname":
|
|
||||||
by = By.TAG_NAME
|
|
||||||
elif by == "name":
|
|
||||||
by = By.NAME
|
|
||||||
elif by == "classname":
|
|
||||||
by == By.CLASS_NAME
|
|
||||||
wait = WebDriverWait(driver, timeout=timeout)
|
wait = WebDriverWait(driver, timeout=timeout)
|
||||||
if not hidden_element:
|
if not hidden_element:
|
||||||
element = wait.until(
|
element = wait.until(
|
||||||
|
@ -161,9 +157,39 @@ def get_element(driver, by, key, hidden_element=False, timeout=8):
|
||||||
(by, key)))
|
(by, key)))
|
||||||
return element
|
return element
|
||||||
except Exception:
|
except Exception:
|
||||||
|
dump_html(driver)
|
||||||
|
take_snapshot(driver)
|
||||||
|
driver.close()
|
||||||
|
driver.quit()
|
||||||
raise Exception(f"Unable to get element of {by} value: {key}")
|
raise Exception(f"Unable to get element of {by} value: {key}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_elements(driver, by, key, hidden_element=False, timeout=8):
    """Wait for and return every element matching a locator.

    Args:
        driver: Selenium WebDriver used for the lookup.
        by: Locator strategy. Either a ``By`` value (``By.XPATH``,
            ``By.CSS_SELECTOR``, ...) or a case-insensitive name such as
            "xpath", "id", "name", "css", "tagname" or "classname".
        key: Locator value (the XPath expression, CSS selector, id, ...).
        hidden_element: When true, only wait for the elements to be present
            and return all of them. Otherwise wait until at least one is
            visible and return the visible ones that are also enabled.
        timeout: Seconds to wait before giving up.

    Returns:
        A list of matching elements.

    Raises:
        Exception: If the locator cannot be resolved or the wait fails. The
            page HTML and a screenshot are dumped and the driver is closed
            and quit before raising; the original error is chained as the
            cause.
    """
    # Short names accepted in addition to the By attribute names
    aliases = {
        "css": "css_selector",
        "tagname": "tag_name",
        "classname": "class_name",
    }

    try:
        # Convert string-based locators to By objects (By.XPATH, By.CSS_SELECTOR, etc.)
        if isinstance(by, str):
            # By values are plain strings too ("css selector", "tag name"),
            # so spaces are mapped to underscores before the attribute lookup
            locator = by.strip().lower().replace(" ", "_")
            locator = aliases.get(locator, locator)
            by = getattr(By, locator.upper())

        wait = WebDriverWait(driver, timeout=timeout)

        if hidden_element:
            elements = wait.until(
                EC.presence_of_all_elements_located((by, key)))
        else:
            visible_elements = wait.until(
                EC.visibility_of_any_elements_located((by, key)))
            elements = [
                element for element in visible_elements if element.is_enabled()]

        return elements
    except Exception as err:
        # Keep debugging artifacts, then release the browser before failing
        dump_html(driver)
        take_snapshot(driver)
        driver.close()
        driver.quit()
        raise Exception(
            f"Unable to get elements of {by} value: {key}") from err
|
||||||
|
|
||||||
|
|
||||||
def execute_selenium_elements(driver, timeout, elements):
|
def execute_selenium_elements(driver, timeout, elements):
|
||||||
try:
|
try:
|
||||||
for index, element in enumerate(elements):
|
for index, element in enumerate(elements):
|
||||||
|
@ -265,7 +291,7 @@ def solve_captcha(site_key, url, retry_attempts=3, version='v2', enterprise=Fals
|
||||||
use_proxy = False
|
use_proxy = False
|
||||||
if CAPTCHA_TESTING:
|
if CAPTCHA_TESTING:
|
||||||
print('Initializing CAPTCHA solver in dummy mode')
|
print('Initializing CAPTCHA solver in dummy mode')
|
||||||
code = "12345"
|
code = random.randint()
|
||||||
print("CAPTCHA Successful")
|
print("CAPTCHA Successful")
|
||||||
return code
|
return code
|
||||||
|
|
||||||
|
@ -340,6 +366,7 @@ def save_browser_session(driver):
|
||||||
|
|
||||||
|
|
||||||
def selenium_action_template(driver):
|
def selenium_action_template(driver):
|
||||||
|
# Data that might need to be entered during webscraping
|
||||||
info = {
|
info = {
|
||||||
"sample_field1": "sample_data",
|
"sample_field1": "sample_data",
|
||||||
"sample_field2": "sample_data",
|
"sample_field2": "sample_data",
|
||||||
|
@ -350,7 +377,7 @@ def selenium_action_template(driver):
|
||||||
{
|
{
|
||||||
"name": "Enter data for sample field 1",
|
"name": "Enter data for sample field 1",
|
||||||
"type": "input",
|
"type": "input",
|
||||||
"input": "{first_name}",
|
"input": "{sample_field1}",
|
||||||
# If a site implements canary design releases, you can place the ID for the element in the new design
|
# If a site implements canary design releases, you can place the ID for the element in the new design
|
||||||
"default": {
|
"default": {
|
||||||
# See get_element() for possible selector types
|
# See get_element() for possible selector types
|
||||||
|
@ -388,3 +415,54 @@ def selenium_action_template(driver):
|
||||||
|
|
||||||
# Execute the selenium actions
|
# Execute the selenium actions
|
||||||
execute_selenium_elements(driver, 8, elements)
|
execute_selenium_elements(driver, 8, elements)
|
||||||
|
|
||||||
|
# Sample task for Google search
|
||||||
|
|
||||||
|
|
||||||
|
def google_search(driver, search_term):
    """Run the Google search steps for search_term on the given driver.

    Builds two steps that both target the element with id "APjFqb" (one of
    type "input" carrying the search term, one of type "input_enter"), fills
    in the "{search_term}" placeholder and passes the steps to
    execute_selenium_elements() with a timeout of 8.
    """
    info = {"search_term": search_term}

    def search_box():
        # A fresh locator dict per use so no two steps share the same object
        return {"type": "xpath", "key": '//*[@id="APjFqb"]'}

    elements = [
        {
            "name": "Type in search term",
            "type": "input",
            "input": "{search_term}",
            "default": search_box(),
            "failover": search_box(),
        },
        {
            "name": "Press enter",
            "type": "input_enter",
            "default": search_box(),
            "failover": search_box(),
        },
    ]

    # Results of callable info entries, computed at most once per placeholder
    site_form_values = {}

    for step in elements:
        # Only steps whose input holds a "{placeholder}" need substitution
        if 'input' not in step or '{' not in step['input']:
            continue

        placeholder = step['input'].strip('{}')
        if placeholder not in info:
            continue

        value = info[placeholder]
        if callable(value):
            if placeholder not in site_form_values:
                site_form_values[placeholder] = value()
            value = site_form_values[placeholder]

        step['input'] = str(value)

    execute_selenium_elements(driver, 8, elements)
|
||||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 148 KiB After Width: | Height: | Size: 153 KiB |
Loading…
Reference in a new issue