I was approached by a non-profit organization that needed some help with its website. I created the initial site for my contact there; however, he could not find anyone to write posts for the organization. I realized that he was already creating posts on social media, so all I had to do was take those posts and add them to his website. I created a partially automated solution to handle this.
Python Scripts
Script 1 : Get All Posts From the Website
import time
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common import alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium import *
from datetime import datetime as dt
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
import re
import os
import post_by_api
# Browser options: pass Chrome's --disable-notifications switch.
option1 = Options()
option1.add_argument("--disable-notifications")
# Get Chrome driver.
# The path is a raw string so that its backslashes can never be read as
# escape sequences (the non-raw form relied on "\P", "\G" and "\c" being
# unrecognised escapes, which newer Python versions warn about).
chrome_driver = webdriver.Chrome(
    executable_path=r"C:\Program Files (x86)\Google\chromedriver_win32\chromedriver.exe",
    chrome_options=option1,
)
chrome_driver.maximize_window()
def main():
    """Run the scraper end to end: sign in, then walk through the events."""
    # Sign in before the events listing is requested.
    get_social()
    # Collect the event links and process every event page.
    get_events()
#Open chrome to login to facebook
def get_social():
    """Load the site's start page and submit the login form.

    Works on the module-level ``chrome_driver`` and returns nothing. The
    credentials are the placeholder strings hard-coded below.
    """
    time.sleep(2)
    chrome_driver.get("http://www.somesitethatallowsscraping.com")

    # Locate both inputs first, then type into them in the same order.
    form_values = (
        (chrome_driver.find_element_by_name('email'), "UserName"),
        (chrome_driver.find_element_by_name('pass'), "Password"),
    )
    for field, value in form_values:
        field.send_keys(value)

    chrome_driver.find_element_by_name("login").click()
    # Fixed pause after submitting the form.
    time.sleep(2)
#Get all event urls
def get_events():
    """Collect the group's event links and process each event page.

    Loads the group's events listing, clicks the element found at
    ``toggle_xpath``, gathers the distinct event URLs from the page's anchors
    and then visits them one by one. Every event whose page does not contain
    "This event was canceled" is handed to ``get_event_data``.

    Works on the module-level ``chrome_driver`` and returns nothing.
    """
    chrome_driver.get("https://www.somesitethatallowsscraping.com/groups/some#/events")

    # Regex for event URLs: fixed prefix, numeric event id, fixed query suffix.
    # Both fixed parts are escaped so they match literally, and the pattern is
    # compiled once instead of being rebuilt for every anchor.
    beg = re.escape(r"https://www.somesitethatallowsscraping.com/events/")
    end = re.escape(r"/?acontext=%7B%22event_action_history%22%3A[%7B%22surface%22%3A%22group%22%7D]%7D")
    event_url = re.compile(beg + r"\d+" + end)

    toggle_xpath = ".//div[@class='n1l5q3vz']/div/div/div/div/span/span"
    try:
        chrome_driver.find_element_by_xpath(toggle_xpath).click()
    except NoSuchElementException:
        # Not in the DOM yet: wait up to 10 seconds for it to appear.
        WebDriverWait(chrome_driver, 10).until(
            EC.presence_of_element_located((By.XPATH, toggle_xpath))
        ).click()

    urls = []
    get_regex = False
    time.sleep(2)
    for lnk in chrome_driver.find_elements_by_tag_name("a"):
        link = lnk.get_attribute("href")
        if link is None:
            # Anchors without an href cannot be event links (and would make
            # the regex match raise TypeError).
            continue
        # NOTE(review): links are only collected after an anchor whose href is
        # exactly "Link" has been seen -- confirm this marker is intended.
        if link == "Link":
            get_regex = True
        if get_regex and event_url.match(link) and link not in urls:
            urls.append(link)

    # All hrefs were read before navigating, so leaving the listing page does
    # not invalidate anything that is still needed.
    for url in urls:
        chrome_driver.get(url)
        time.sleep(2)
        if "This event was canceled" in chrome_driver.page_source:
            continue
        get_event_data()
        print(url)
#Get all event data and prepare data to transfer to wordpress using wp-json api
def get_event_data():
    """Scrape the event page currently loaded in the browser and post it.

    Reads the date line, title, place and description from the page held by
    the module-level ``chrome_driver``. If ``get_date`` accepts the date line,
    the data is wrapped in an HTML fragment and passed to
    ``post_by_api.send_post`` together with the title; otherwise
    "outdated!!!" is printed and nothing is posted.

    Raises:
        selenium.common.exceptions.TimeoutException: if a required element
            does not appear within 10 seconds.
    """
    import html  # local import: only needed to escape the scraped text

    header_xpath = ".//div[@class='bi6gxh9e aov4n071']/h2/span"
    description_xpath = ".//div[@class='p75sslyk']/span/div"

    # Click the control inside the description block first (presumably the
    # "See more" toggle, since "See less" is stripped from the text below).
    _find_or_wait(description_xpath + "/div").click()

    def _headers_ready(driver):
        # find_elements never raises; it returns a (possibly short) list, so
        # poll until both header spans (date line and title) are present.
        found = driver.find_elements_by_xpath(header_xpath)
        return found if len(found) >= 2 else False

    header_info = WebDriverWait(chrome_driver, 10).until(_headers_ready)
    header_info1 = _find_or_wait(".//div[@class='bi6gxh9e aov4n071']/span/span")

    # These two elements are not read, but looking them up keeps the original
    # behaviour of requiring them to be present before the page is processed.
    _find_or_wait(".//div[@class='w0hvl6rk qjjbsfad']/h2/span")
    details_para = _find_or_wait(description_xpath)
    # The first attempt used to query ".img", which is not a valid XPath
    # expression; the full image XPath is now used for both attempts.
    _find_or_wait(".//img[@class='i09qtzwb rq0escxv n7fi1qx3 pmk7jnqg j9ispegn kr520xx4 datstx6m k4urcfbm']")

    about = str(details_para.text).replace("See less", "")
    when = str(header_info[0].text)
    title = str(header_info[1].text).replace("/", "")
    place = str(header_info1.text)

    if not get_date(when):
        print("outdated!!!")
        return

    # The values come from a scraped page, so escape them before placing them
    # inside HTML text nodes.
    text = '''
<html>
<body>
<div class="post_cont1">
<h2 class="post_h2_headers1">Time And Date : {when}</h2>
<h2 class="post_h2_headers1">Event Name : {title}</h2>
<h2 class="post_h2_headers1">Event Place : {place}</h2>
</div>
<h3 class="post_h3_headers1">About Event</h3>
<p class="post_para1">{about}</p>
</body>
</html>
'''.format(
        when=html.escape(when, quote=False),
        title=html.escape(title, quote=False),
        place=html.escape(place, quote=False),
        about=html.escape(about, quote=False),
    )
    print(text)
    # This call hands the HTML to the second script, which sends it to
    # WordPress as JSON.
    post_by_api.send_post(text, title)


def _find_or_wait(xpath, timeout=10):
    """Return the element at ``xpath``, waiting up to ``timeout`` seconds for it."""
    try:
        return chrome_driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return WebDriverWait(chrome_driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
def match_string(string, re_link):
    """Match the regular expression ``re_link`` at the start of ``string``.

    Args:
        string: Text to test, or ``None`` (e.g. an anchor without an href).
        re_link: Regular expression pattern.

    Returns:
        The ``re.Match`` object on success, otherwise ``None``. ``None`` is
        also returned when ``string`` is ``None`` instead of raising
        ``TypeError``.
    """
    if string is None:
        return None
    return re.match(re_link, string)
#Check the date and make sure the date of the event is not older than the current date.
def get_date(string):
    """Tell whether the date written in ``string`` is still ahead of today.

    The text must contain a month name (full or abbreviated, any case), a
    one- or two-digit day surrounded by whitespace, and a four-digit year
    starting with "20" -- presumably something like
    "SATURDAY, MARCH 5, 2022 AT 7 PM". Weekday and time of day are ignored.

    Args:
        string: Date line scraped from the event page.

    Returns:
        True if ``still_time`` accepts the parsed year, month and day;
        False otherwise, including when the text cannot be parsed.
    """
    month_match = re.search(
        r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|"
        r"Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|"
        r"Dec(?:ember)?",
        string,
        re.IGNORECASE,
    )
    day_match = re.search(r"\s+(\d{1,2}),?\s+", string)
    # The year may be followed by whitespace or be the last thing in the text.
    year_match = re.search(r"\s+(20\d{2})(?:\s+|$)", string)
    if month_match is None or day_match is None or year_match is None:
        return False

    # Key on the first three letters, upper-cased, so that "Mar", "MARCH" and
    # "march" all resolve to the same month number.
    month_numbers = {
        'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
        'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
    }
    month = month_numbers[month_match.group(0)[:3].upper()]
    daymonth = int(day_match.group(1))
    year = int(year_match.group(1))
    print(daymonth)
    return bool(still_time(year, month, daymonth))
#Check if there is still time from today to the date of the event
def still_time(year, month, daymonth):
    """Return True when the given date lies strictly after today.

    Args:
        year, month, daymonth: Date parts as ints or as strings accepted by
            ``int()`` (surrounding whitespace is fine).

    Returns:
        True if (year, month, day) is later than today's local date,
        otherwise False. An event dated today counts as not ahead.
    """
    print(str(year) + " " + str(month) + " " + str(daymonth))
    today = dt.now()
    # Compare the parts as one tuple so the month and day are only consulted
    # when the more significant parts are equal; the previous check accepted
    # past-year dates in the current month with a later day.
    event_day = (int(year), int(month), int(daymonth))
    return event_day > (today.year, today.month, today.day)
# Run the scraper, and always shut the browser and its driver process down
# afterwards, even when scraping stops with an error.
try:
    main()
finally:
    chrome_driver.quit()
Script 2 : Python Script to Post Data To WordPress
from datetime import date, datetime as dt
import requests
import json
import base64
def send_post(text, title):
    """Publish a post on the WordPress site through its wp-json REST API.

    Sends one POST request to ``<url>/posts`` with HTTP Basic authentication
    and prints the response object.

    Args:
        text: Post body (HTML) sent as the ``content`` field.
        title: Post title.
    """
    url = 'https://www.website.com/wp-json/wp/v2'
    # Placeholder credentials hard-coded in the script.
    user = "UserName"
    password = "Password"
    # Basic auth header value: base64 of "user:password".
    creds = user + ':' + password
    token = base64.b64encode(creds.encode())
    header = {'Authorization': 'Basic ' + token.decode('utf-8')}
    post_data = {
        # 24-hour clock (%H): the previous %I produced a 12-hour value with
        # no AM/PM marker, so afternoon posts were dated twelve hours early.
        'date': dt.now().strftime("%Y-%m-%dT%H:%M:%S"),
        'title': title,
        'content': text,
        'categories': '4',
        'status': 'publish'
    }
    # A timeout keeps an unresponsive server from blocking the run forever.
    r = requests.post(url + '/posts', headers=header, json=post_data, timeout=30)
    print(r)
