Python Selenium Automation

I was approached by a non-profit organization that needed some help with its website. I created the initial site for my contact there; however, he could not find anyone to create posts for his organization. I realized that he was already creating posts on social media, so all I had to do was take those posts and add them to his website. To handle this, I created a partially automated solution.

Python Scripts

Script 1 : Get All Posts From the Website

import time
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.common.by import By
from selenium.webdriver.common import alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium import *
from datetime import datetime as  dt
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
import re 
import os 
import post_by_api

# Browser options: start Chrome with the "--disable-notifications" switch.
option1 = Options()
option1.add_argument("--disable-notifications")

# Get Chrome driver.
# The path is a raw string so the Windows backslashes are never read as
# escape sequences (the runtime value is unchanged).
chrome_driver = webdriver.Chrome(
    executable_path=r"C:\Program Files (x86)\Google\chromedriver_win32\chromedriver.exe",
    chrome_options=option1,
)
chrome_driver.maximize_window()

def main():
    """Run the whole job: log in first, then collect and process the events."""
    for step in (get_social, get_events):
        step()

# Open Chrome and log in to the social site
def get_social():
    """Load the site's landing page and submit the login form.

    Works on the module-level ``chrome_driver``. A fixed two-second pause is
    taken before loading the page and again after clicking the login button.
    """
    time.sleep(2)
    chrome_driver.get("http://www.somesitethatallowsscraping.com")

    # Look up both form fields first, then type into them in the same order.
    credentials = (
        (chrome_driver.find_element_by_name('email'), "UserName"),
        (chrome_driver.find_element_by_name('pass'), "Password"),
    )
    for field, value in credentials:
        field.send_keys(value)

    login_button = chrome_driver.find_element_by_name("login")
    login_button.click()
    time.sleep(2)

# Get all event urls
def get_events():
    """Collect the event links on the group's events page and process each one.

    Steps:
      1. Open the group's events page and click a page control (retrying
         with an explicit wait of up to 10 seconds if it is not there yet).
      2. Walk every ``<a>`` element and keep the unique hrefs that match the
         event-URL pattern, in page order.
      3. Visit each kept URL, skip pages that say the event was canceled,
         hand the page to ``get_event_data()`` and print the URL.

    Raises:
        selenium.common.exceptions.TimeoutException: if the page control
            does not appear within 10 seconds.
    """
    chrome_driver.get("https://www.somesitethatallowsscraping.com/groups/some#/events")

    # Regex for event URLs: escaped prefix + numeric part + escaped query suffix.
    # Built once here instead of on every loop iteration.
    beg = re.escape(r"https://www.somesitethatallowsscraping.com/events/")
    end = re.escape(r"/?acontext=%7B%22event_action_history%22%3A[%7B%22surface%22%3A%22group%22%7D]%7D")
    event_link_re = beg + r"\d+" + end

    # NOTE(review): presumably this control expands/reveals the event list;
    # confirm against the live page.
    control_xpath = ".//div[@class='n1l5q3vz']/div/div/div/div/span/span"
    try:
        chrome_driver.find_element_by_xpath(control_xpath).click()
    except NoSuchElementException:
        clickable_element = WebDriverWait(chrome_driver, 10).until(
            EC.presence_of_element_located((By.XPATH, control_xpath))
        )
        clickable_element.click()

    urls = []
    get_regex = False

    time.sleep(2)
    for lnk in chrome_driver.find_elements_by_tag_name("a"):
        link = lnk.get_attribute("href")
        # NOTE(review): matching only starts after an anchor whose href is
        # exactly "Link". Confirm such an anchor exists on the page;
        # otherwise no URL is ever collected.
        if link == "Link":
            get_regex = True
        # Anchors without an href yield None, which re.match cannot handle.
        if not get_regex or link is None:
            continue
        if match_string(link, event_link_re) is not None and link not in urls:
            urls.append(link)

    for url in urls:
        chrome_driver.get(url)
        time.sleep(2)
        if "This event was canceled" in chrome_driver.page_source:
            continue
        get_event_data()
        print(url)

def _find_or_wait(xpath, timeout=10):
    """Return the first element matching *xpath*, waiting up to *timeout* seconds for it."""
    try:
        return chrome_driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return WebDriverWait(chrome_driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )


# Get all event data and prepare data to transfer to wordpress using wp-json api
def get_event_data():
    """Scrape the event page currently open in ``chrome_driver`` and post it.

    Reads the header spans (index 0 is used as the date/time text, index 1
    as the event name), the place and the description paragraph, builds an
    HTML fragment from them and passes it to ``post_by_api.send_post``. Events
    for which ``get_date`` does not return ``True`` are reported as
    ``outdated!!!`` and not posted.

    Raises:
        selenium.common.exceptions.TimeoutException: if a required element
            does not appear within 10 seconds.
    """
    import html  # local import: only this function needs it

    # Get all the page elements that hold the required data.
    # NOTE(review): presumably this click expands the description (the text
    # later has "See less" stripped from it); confirm against the live page.
    _find_or_wait(".//div[@class='p75sslyk']/span/div/div").click()

    # find_elements_* returns an empty list instead of raising, so the
    # "not there yet" case has to be detected by emptiness.
    header_xpath = ".//div[@class='bi6gxh9e aov4n071']/h2/span"
    header_info = chrome_driver.find_elements_by_xpath(header_xpath)
    if not header_info:
        header_info = WebDriverWait(chrome_driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, header_xpath))
        )
    if len(header_info) < 2:
        print("skipping event: expected date and title headers, found " + str(len(header_info)))
        return

    header_info1 = _find_or_wait(".//div[@class='bi6gxh9e aov4n071']/span/span")

    # These two elements are not used below; the lookups are kept so the
    # function still waits for them (and fails if they never appear).
    _find_or_wait(".//div[@class='w0hvl6rk qjjbsfad']/h2/span")
    _find_or_wait(".//img[@class='i09qtzwb rq0escxv n7fi1qx3 pmk7jnqg j9ispegn kr520xx4 datstx6m k4urcfbm']")

    details_para = _find_or_wait(".//div[@class='p75sslyk']/span/div")

    s = str(details_para.text).replace("See less", "")
    date_text = str(header_info[0].text)
    title = str(header_info[1].text).replace("/", "")

    if get_date(date_text) is not True:
        print("outdated!!!")
        return

    # The scraped strings are plain text from a third-party page: escape
    # them so characters such as "<" or "&" cannot break the markup.
    text ='''
            <html>
                <body>
                    <div class="post_cont1">
                    <h2 class="post_h2_headers1">Time And Date : ''' + html.escape(date_text, quote=False) + '''</h2>
                    <h2 class="post_h2_headers1">Event Name : ''' + html.escape(title, quote=False) + '''</h2>
                    <h2 class="post_h2_headers1">Event Place : ''' + html.escape(str(header_info1.text), quote=False) + '''</h2>
                    </div>
                    <h3 class="post_h3_headers1">About Event</h3>
                    <p class="post_para1">''' + html.escape(s, quote=False) + '''</p>

                </body>
            </html>

            '''
    print(text)

    # This function calls the second script and sends the html to be output
    # as a json string to wordpress
    post_by_api.send_post(text, title)

def match_string(string, re_link):
    """Match *string* against the regex *re_link*, anchored at the start.

    Args:
        string: Text to test.
        re_link: Regular-expression pattern (string or compiled).

    Returns:
        The ``re.Match`` object when the beginning of *string* matches,
        otherwise ``None``.
    """
    return re.match(re_link, string)
    
# Check date and make sure date of event is not older than current date.
def get_date(string):
    """Parse the event date out of *string* and report whether it is still ahead.

    The text is searched for a month name (full or three-letter form, any
    case), a one- or two-digit day surrounded by whitespace, and a four-digit
    year starting with ``20``. The parsed values are handed to
    ``still_time``.

    Args:
        string: Header text of the event, e.g.
            ``"SATURDAY, MARCH 5, 2022 AT 10 AM"``.

    Returns:
        ``True`` when ``still_time`` reports the date as upcoming, ``False``
        otherwise, including when no month, day or year can be found.
    """
    month_match = re.search(
        r"(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?"
        r"|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)",
        string,
        re.IGNORECASE,
    )
    day_match = re.search(r"\s+(\d{1,2}),?\s+", string)
    year_match = re.search(r"\b(20\d{2})\b", string)

    # An unparseable header is treated as "not upcoming" instead of crashing
    # the whole scrape on a missing match.
    if month_match is None or day_match is None or year_match is None:
        print("could not parse event date: " + repr(string))
        return False

    # Every month alternative starts with its three-letter abbreviation, so
    # the first three characters (upper-cased) identify the month.
    month_abbrevs = ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                     "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")
    month = month_abbrevs.index(month_match.group(0)[:3].upper()) + 1
    daymonth = int(day_match.group(1))
    year = int(year_match.group(1))
    print(daymonth)

    return bool(still_time(year, month, daymonth))

# Check if there is still time from today to the date of the event
def still_time(year, month, daymonth):
    """Return whether the given date lies strictly after today.

    Args:
        year, month, daymonth: Date parts; each may be an ``int`` or a
            string accepted by ``int()`` (surrounding whitespace is fine).

    Returns:
        ``True`` when (year, month, day) is later than today's date,
        ``False`` when it is today or earlier.
    """
    print(str(year) + " " + str(month) + " " + str(daymonth))
    # Take "now" once and compare the parts lexicographically, so the month
    # and day only decide when the more significant parts are equal.
    today = dt.now()
    event_date = (int(year), int(month), int(daymonth))
    return event_date > (today.year, today.month, today.day)

# Start the scrape only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()

Script 2 : Python Script to Post Data To WordPress

from datetime import date, datetime as  dt
import requests
import json
import base64

def send_post(text, title):
    """Create a published post on the WordPress site via the wp-json API.

    Sends an HTTP POST to ``<url>/posts`` with HTTP Basic authentication.
    The post is dated with the current local time, filed under category
    ``'4'`` and given status ``publish``. The response object is printed.

    Args:
        text: Post body (HTML).
        title: Post title.
    """
    url = 'https://www.website.com/wp-json/wp/v2'

    user = "UserName"
    password = "Password"

    # HTTP Basic auth: base64 of "user:password".
    creds = user + ':' + password
    token = base64.b64encode(creds.encode())
    header = {'Authorization': 'Basic ' + token.decode('utf-8')}

    post_data = {
        # %H is the 24-hour clock; %I (12-hour) would date afternoon posts
        # twelve hours too early.
        'date': dt.now().strftime("%Y-%m-%dT%H:%M:%S"),
        'title': title,
        'content': text,
        'categories': '4',
        'status': 'publish',
    }

    # A timeout keeps the script from hanging forever on an unresponsive server.
    r = requests.post(url + '/posts', headers=header, json=post_data, timeout=30)

    print(r)

Leave a Reply

Your email address will not be published. Required fields are marked *