활용 기술
파이썬, 워드프레스 및 플러그인 설치, 독립호스팅 서버(nginx, MySQL, PHP 등)
수집 항목: 기본 정보만 수집합니다(아래 소스 참조).
#-*- coding:UTF-8 -*-
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time, random
import pandas as pd
import time
from PIL import Image
from bs4 import BeautifulSoup
import requests
import json
from urllib.request import Request, urlopen
import pandas as pd
import xml.etree.ElementTree as ET
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
import requests
import json
import base64
from datetime import datetime
# Browser setup: one shared Chrome session used for every page load below.
# The driver binary path is passed through a Service object and the options
# through ``options=``; the older positional path / ``chrome_options=`` form is
# deprecated in Selenium 4 and rejected by current releases.
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--log-level=3")  # keep Chrome's console logging quiet
# The next three settings switch off the automation-related switches/features
# named in their arguments.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
""" CONFIG start """
# Timestamp of this run formatted as "YYYY-MM-DD HH:MM".
# %M is the minute field; %I would repeat the hour on a 12-hour clock.
now = datetime.now()
now = datetime.strftime(now, '%Y-%m-%d %H:%M')

# WordPress REST API endpoint and Basic-auth header.
# These assignments must be live: ``header`` is built from ``token``, so leaving
# them inside a string literal makes the script fail with NameError on start.
url = "https://워드프레스가 설치된 도메인주소/wp-json/wp/v2/posts"
user = "크롤러"  # WordPress user (needs at least post-writing permission)
password = "XXXX XXXX XXXX XXXX"  # generated with the Application Passwords plugin
credentials = user + ':' + password
token = base64.b64encode(credentials.encode())
header = {'Authorization': 'Basic '+token.decode('utf-8')}

KEY1 = "xxxxxx"  # API key issued by the Korean Film Council (KOBIS)
KEY2 = ""  # API key for the Culture Public Data Plaza
PAGE = [*range(100,102, 1)]
PERPAGE = 10

# Genre name (as returned by the KOBIS API) -> WordPress category ID.
MOVIECATE = {
    "공연": "658",
    "드라마": "641",
    "서부극(웨스턴)": "657",
    "다큐멘터리": "655",
    "사극": "653",
    "뮤지컬": "656",
    "미스터리": "650",
    "범죄": "640",
    "SF": "646",
    "가족": "651",
    "공포(호러)": "645",
    "기타": "654",
    "성인물(에로)": "647",
    "스릴러": "644",
    "애니메이션": "649",
    "액션": "638",
    # NOTE(review): same ID as "애니메이션" above -- confirm this is intended.
    "어드벤처": "649",
    "전쟁": "652",
    "코미디": "642",
    "판타지": "648",
    "멜로/로맨스": "643",
    "영화소개": "635",
    "영화장르": "637",
}
""" CONFIG end """
def download_image(imageurl):
    """Download a KOBIS poster and save a recompressed RGB copy of it.

    The local file name is one path segment of the URL, picked by position in
    ``imageurl.split("/")``: index 8 for URLs containing
    ``https://www.kobis.or.kr/common/mast/movie`` and index 9 for URLs
    containing ``https://www.kobis.or.kr/upload/up_img/cleansing``. The raw
    bytes are written to ``<name>`` and an RGB copy saved with ``quality=70``
    to ``small_<name>``, both relative to the current working directory.

    The function is best-effort: unrecognised URLs, HTTP error responses and
    file/image errors are reported on stdout and nothing is raised for them.
    Exceptions raised by ``requests.get`` itself still propagate.

    Args:
        imageurl: Absolute URL of the poster image.

    Returns:
        None.
    """
    segments = imageurl.split("/")
    if 'https://www.kobis.or.kr/common/mast/movie' in imageurl:
        name_index = 8
    elif 'https://www.kobis.or.kr/upload/up_img/cleansing' in imageurl:
        name_index = 9
    else:
        # No known layout: there is no meaningful file name to write to.
        print(f"download_image: unrecognised image URL, skipped: {imageurl}")
        return
    if len(segments) <= name_index:
        print(f"download_image: URL too short to contain a file name: {imageurl}")
        return
    image_name = segments[name_index]

    # A timeout keeps one stalled download from hanging the whole crawl.
    response = requests.get(imageurl, timeout=30)
    if not response.ok:
        # Do not store an error page as if it were an image.
        print(f"download_image: HTTP {response.status_code} for {imageurl}")
        return

    try:
        with open(image_name, "wb") as raw_file:
            raw_file.write(response.content)
        # Re-encode as RGB at reduced quality for use as the post thumbnail.
        with Image.open(image_name) as image_file:
            image_file.convert('RGB').save(f"small_{image_name}", quality=70)
    except (OSError, ValueError) as exc:
        # OSError: unwritable path or unreadable/corrupt image data.
        # ValueError: Pillow cannot derive an output format from the name.
        print(f"download_image: could not store {image_name}: {exc}")
def delay():
    """Pause for 2 or 3 seconds, chosen at random, between page requests."""
    pause_seconds = random.randint(2, 3)
    time.sleep(pause_seconds)
# Load the list of movie codes to crawl: only column A of the workbook is
# read, and its ``movieCd`` column becomes a plain Python list.
readexcel = pd.read_excel(
    'Movie_CD_list.xlsx',
    engine='openpyxl',
    usecols="A",
)
arrsize = readexcel.size  # number of cells loaded (rows x columns)
movieCdx = readexcel['movieCd'].tolist()
# Number of leading cast members that become tags. The value 11 reproduces
# what the previous counter loop actually appended.
# NOTE(review): 10 may have been the intended limit -- confirm.
_MAX_ACTOR_TAGS = 11


def _names(entries, key):
    """Return ``entry[key]`` for every dict in ``entries``, preserving order."""
    return [entry[key] for entry in entries]


def _thumbnail_filename(image_url):
    """Return the ``small_<name>`` file written by download_image(), or None.

    The name is taken from the same URL position download_image() uses:
    index 8 of ``image_url.split("/")`` for ``/common/mast/movie`` URLs and
    index 9 for ``/upload/up_img/cleansing`` URLs. None is returned for any
    other URL or when the URL has too few segments.
    """
    segments = image_url.split("/")
    if 'https://www.kobis.or.kr/common/mast/movie' in image_url:
        name_index = 8
    elif 'https://www.kobis.or.kr/upload/up_img/cleansing' in image_url:
        name_index = 9
    else:
        return None
    if len(segments) <= name_index:
        return None
    return 'small_' + segments[name_index]


def _build_summary(synopsis, rows):
    """Render the post body: a synopsis table followed by a facts table.

    Args:
        synopsis: Plain-text synopsis; HTML-escaped before insertion.
        rows: Iterable of ``(label, value)`` pairs. Labels are trusted markup
            (some contain ``<br>``); values are converted with ``str`` and
            HTML-escaped because they come from scraped/API data.

    Returns:
        The HTML fragment as a single string.
    """
    from html import escape  # local import keeps this block self-contained

    parts = [
        '<div class="p-3">',
        '<table class="table">',
        '<tr>',
        '<th class="table-light p-2"><h3>시놉시스</h3>',
        '</th>',
        '</tr>',
        '<tr>',
        '<td class="table-warning p-4 fs-6">' + escape(synopsis),
        '</td>',
        '</tr>',
        '</table>',
        '<table class="table table-striped">',
        '<tr>',
        '<td style="width:20%;">구분',
        '<td>정보',
        '</tr>',
    ]
    for label, value in rows:
        parts.append(
            f'<tr><td><span class="summary_head">{label} </span></td><td><span class="summary_val"><b>{escape(str(value))}</b></span><br></td></tr>'
        )
    parts.extend(['</table>', '</div>', '<br>'])
    return ''.join(parts)


# The connection settings never change, so one client serves the whole run.
# NOTE(review): python-wordpress-xmlrpc is understood to contact the server
# when a Client is constructed, so building it per movie repeated that call.
client = Client("https://도메인/xmlrpc.php", "워드프레스 사용자(글쓰기가능)", "비밀번호")

for cd in movieCdx:
    delay()
    driver.get('https://www.kobis.or.kr/kobis/business/mast/mvie/searchMovieList.do?dtTp=movie&dtCd='+str(cd))

    # A post is only published when a real poster exists, so movies without a
    # usable poster link are skipped before spending an API call on them.
    try:
        image = driver.find_element(By.XPATH, '//a[@class="fl thumb"]').get_attribute('href')
    except NoSuchElementException:
        image = None
    filename = _thumbnail_filename(image) if image else None
    if filename is None:
        print(f"[skip] {cd}: no usable poster image")
        continue

    try:
        synosis = driver.find_element(By.XPATH, '//p[@class="desc_info"]').text
    except NoSuchElementException:
        synosis = '-'
    print(synosis)

    delay()
    # The JSON response is opened in the browser and read from the <pre>
    # element located by the XPath below.
    detailurl = 'http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json?key='+ str(KEY1) +'&movieCd='+ str(cd)
    driver.get(detailurl)
    movieDetail = driver.find_element(By.XPATH, '/html/body/pre').text
    movieInfo = json.loads(movieDetail)['movieInfoResult']['movieInfo']

    movieNm = movieInfo['movieNm']
    movieNmEn = movieInfo['movieNmEn']
    nations_L = _names(movieInfo['nations'], 'nationNm')
    # Categories are passed to WordPress by genre name (terms_names below), so
    # no lookup of category IDs is needed on this XML-RPC path.
    genreNm_L = _names(movieInfo['genres'], 'genreNm')
    directors_L = _names(movieInfo['directors'], 'peopleNm')
    actors_L = _names(movieInfo['actors'], 'peopleNm')

    summary = _build_summary(synosis, [
        ('영화코드', cd),
        ('영화명(국문)', movieNm),
        ('영화명(영문)', movieNmEn),
        ('제작연도', movieInfo['prdtYear']),
        ('개봉일', movieInfo['openDt']),
        ('영화유형', movieInfo['typeNm']),
        ('제작상태', movieInfo['prdtStatNm']),
        ('제작국가<br>(전체)', ",".join(nations_L)),
        ('영화장르<br>(전체)', ",".join(genreNm_L)),
        ('영화감독', ",".join(directors_L)),
        ('배우', ",".join(actors_L)),
    ])

    # Fetch the poster, then read the recompressed copy for upload.
    download_image(image)
    try:
        with open(filename, 'rb') as img:
            # The XML-RPC library encodes the Binary payload itself.
            bits = xmlrpc_client.Binary(img.read())
    except OSError as exc:
        # download_image() reports its own failures and returns normally, so
        # a missing file here means the thumbnail could not be produced.
        print(f"[skip] {cd}: thumbnail {filename} is not readable ({exc})")
        continue

    response = client.call(media.UploadFile({
        'name': filename,
        'type': 'image/jpeg',  # mimetype
        'bits': bits,
    }))
    attachment_id = response['id']

    # Tag order: genres, titles, directors, then the leading cast members.
    tags = (
        genreNm_L
        + [movieNm, movieNmEn, '영화감독']
        + directors_L
        + ['출연배우']
        + actors_L[:_MAX_ACTOR_TAGS]
    )
    # Drop blank names (e.g. a movie without an English title) so that no
    # empty tag is submitted.
    tags = [tag for tag in tags if tag]

    try:
        post = WordPressPost()
        post.title = movieNm+'('+movieNmEn+')'
        post.slug = movieNm
        post.content = summary
        post.terms_names = {
            'category': genreNm_L,
            'post_tag': tags,
        }
        post.post_status = 'publish'
        post.thumbnail = attachment_id
        post.id = client.call(posts.NewPost(post))
    except Exception as exc:
        # Publishing stays best-effort so one bad movie does not stop the
        # batch, but the failure is reported instead of being discarded.
        print(f"[error] {cd}: post was not created ({exc!r})")