Learn as if you will live forever, live like you will die tomorrow.

   +1 555 87 89 56   80 Harrison Lane, FL 32547

Home › 파이썬 크롤링과 워드프레스 원격 포스팅을 활용한 마케팅 사이트 패스트 빌드 | WORKS | 파이썬 크롤링과 워드프레스 원격 포스팅을 활용한 마케팅 사이트 패스트 빌드

파이썬 크롤링과 워드프레스 원격 포스팅을 활용한 마케팅 사이트 패스트 빌드

활용 기술

파이썬, 워드프레스 및 플러그인 설치, 독립호스팅 서버(nginx, MySQL, PHP 등)

수집항목은 기본정보만 수집(아래 소스 참조)

#-*- coding:UTF-8 -*-

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time, random
import pandas as pd

import time
from PIL import Image
from bs4 import BeautifulSoup
import requests
import json
from urllib.request import Request, urlopen
import pandas as pd
import xml.etree.ElementTree as ET

from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts

import requests
import json
import base64
from datetime import datetime

# Selenium 4 takes the chromedriver path through a Service object and the
# browser options through ``options=``; the positional path argument and the
# ``chrome_options=`` keyword are no longer accepted by current releases.
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
#options.add_argument("start-maximized")
options.add_argument("--log-level=3")
# The next three settings switch off Chrome's automation switch, automation
# extension and the AutomationControlled blink feature.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
# webdriver_manager installs a chromedriver and returns its local path.
driver = webdriver.Chrome(
	service=Service(ChromeDriverManager().install()),
	options=options,
)


""" CONFIG start """
# Timestamp of this run as "YYYY-MM-DD HH:MM".
# %M is the minute field; %I is the 12-hour clock hour and does not belong here.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M'
now = datetime.now().strftime(TIMESTAMP_FORMAT)

# WordPress REST API credentials (HTTP Basic auth). These must be real
# assignments: `header` below needs `token`, and leaving them inside a string
# literal makes the script stop with a NameError before any crawling starts.
url = "https://워드프레스가 설치된 도메인주소/wp-json/wp/v2/posts"
user = "크롤러"  # WordPress user with at least permission to write posts
password = "XXXX XXXX XXXX XXXX"  # password generated with the Application Passwords plugin
credentials = user + ':' + password
token = base64.b64encode(credentials.encode())

header = {'Authorization': 'Basic ' + token.decode('utf-8')}

KEY1 = "xxxxxx"  # API key issued by the Korean Film Council (KOBIS)
KEY2 = ""  # API key for the culture public-data portal
PAGE = list(range(100, 102))
PERPAGE = 10

# KOBIS genre name -> WordPress category ID (IDs are kept as strings).
# NOTE(review): "애니메이션" and "어드벤처" both map to "649"; one of them is
# probably a copy/paste slip - confirm against the site's category list.
MOVIECATE = {
    "공연": "658",
    "드라마": "641",
    "서부극(웨스턴)": "657",
    "다큐멘터리": "655",
    "사극": "653",
    "뮤지컬": "656",
    "미스터리": "650",
    "범죄": "640",
    "SF": "646",
    "가족": "651",
    "공포(호러)": "645",
    "기타": "654",
    "성인물(에로)": "647",
    "스릴러": "644",
    "애니메이션": "649",
    "액션": "638",
    "어드벤처": "649",
    "전쟁": "652",
    "코미디": "642",
    "판타지": "648",
    "멜로/로맨스": "643",
    "영화소개": "635",
    "영화장르": "637",
}

""" CONFIG end """

def download_image(imageurl):
	"""Download a KOBIS poster and save a re-compressed ``small_<name>`` copy.

	The raw download is written to the current directory under the file name
	taken from the URL path. It is then reopened, converted to RGB and saved
	again as ``small_<name>`` with ``quality=70``.

	Only URLs containing one of the two KOBIS poster prefixes are handled; any
	other URL (or an empty one) is ignored without touching the network or the
	disk. Download, file and image errors are printed and the function returns
	normally, so the caller can carry on with the next movie.

	Args:
		imageurl: Absolute URL of the poster image.

	Returns:
		None.
	"""
	if not imageurl:
		return

	# The file name is the path segment at a fixed position of the split URL,
	# which differs between the two kinds of poster URL.
	if 'https://www.kobis.or.kr/common/mast/movie' in imageurl:
		name_index = 8
	elif 'https://www.kobis.or.kr/upload/up_img/cleansing' in imageurl:
		name_index = 9
	else:
		print(f"download_image: unsupported image URL, skipped: {imageurl}")
		return

	url_parts = imageurl.split("/")
	if len(url_parts) <= name_index:
		print(f"download_image: no file name in URL, skipped: {imageurl}")
		return
	image_name = url_parts[name_index]

	try:
		response = requests.get(imageurl, timeout=30)
		# Do not store an HTTP error page as if it were the poster.
		response.raise_for_status()
		with open(image_name, "wb") as raw_file:
			raw_file.write(response.content)
		# Re-encode at lower quality to produce the copy that gets uploaded.
		with Image.open(image_name) as image_file:
			image_file.convert('RGB').save(f"small_{image_name}", quality=70)
	except (requests.RequestException, OSError, ValueError) as exc:
		# OSError covers file errors and unreadable images; ValueError is what
		# Pillow raises when it cannot pick an output format from the name.
		print(f"download_image: failed for {imageurl}: {exc}")
	

def delay():
	"""Block for a randomly chosen whole number of seconds, either 2 or 3."""
	wait_seconds = random.randint(2, 3)
	time.sleep(wait_seconds)

# Read only column A of the spreadsheet; that column must be headed "movieCd"
# because the codes are selected by that name below.
readexcel = pd.read_excel(
	'Movie_CD_list.xlsx',
	usecols="A",
	engine='openpyxl',
)

# Movie codes to crawl, as a plain Python list.
movieCdx = readexcel['movieCd'].tolist()

# Number of cells that were read.
arrsize = readexcel.size

import html  # stdlib; imported here because only this section escapes scraped text


def _poster_file_name(image_url):
	"""Return the local ``small_<name>`` file name for a KOBIS poster URL.

	The path segment used as the name sits at index 8 (``common/mast/movie``
	URLs) or index 9 (``upload/up_img/cleansing`` URLs) of the ``/``-split URL,
	the same positions ``download_image`` uses when it saves the file.

	Returns None when the URL is missing, is not one of those two kinds, or is
	too short to contain that segment.
	"""
	if not image_url:
		return None
	if 'https://www.kobis.or.kr/common/mast/movie' in image_url:
		name_index = 8
	elif 'https://www.kobis.or.kr/upload/up_img/cleansing' in image_url:
		name_index = 9
	else:
		return None
	url_parts = image_url.split("/")
	if len(url_parts) <= name_index or not url_parts[name_index]:
		return None
	return 'small_' + url_parts[name_index]


def _summary_row(label, value):
	"""Return one label/value row of the info table.

	``label`` is trusted markup written in this script (it may contain
	``<br>``); ``value`` comes from the crawl and is HTML-escaped.
	"""
	return (
		'<tr><td><span class="summary_head">' + label + ' </span></td>'
		'<td><span class="summary_val"><b>' + html.escape(str(value)) + '</b></span><br></td></tr>'
	)


def _build_summary(synopsis, rows):
	"""Return the post body: a synopsis table followed by the info table.

	Args:
		synopsis: Synopsis text scraped from the movie page (escaped here).
		rows: Iterable of ``(label, value)`` pairs for the info table.
	"""
	parts = [
		'<div class="p-3">',
		'<table class="table">',
		'<tr>',
		'<th class="table-light p-2"><h3>시놉시스</h3>',
		'</th>',
		'</tr>',
		'<tr>',
		'<td class="table-warning p-4 fs-6">' + html.escape(synopsis),
		'</td>',
		'</tr>',
		'</table>',
		'<table class="table table-striped">',
		'<tr>',
		'<td style="width:20%;">' + '구분',
		'<td>' + '정보',
		'</tr>',
	]
	parts.extend(_summary_row(label, value) for label, value in rows)
	parts.extend(['</table>', '</div>', '<br>'])
	return "".join(parts)


try:
	# The client does not depend on the movie being processed, so it is
	# created once instead of once per iteration.
	client = Client("https://도메인/xmlrpc.php", "워드프레스 사용자(글쓰기가능)", "비밀번호")

	for cd in movieCdx:
		delay()
		driver.get('https://www.kobis.or.kr/kobis/business/mast/mvie/searchMovieList.do?dtTp=movie&dtCd='+str(cd))

		# Poster link and synopsis come from the KOBIS movie page.
		try:
			image = driver.find_element(By.XPATH, '//a[@class="fl thumb"]').get_attribute('href')
		except NoSuchElementException:
			image = None
		try:
			synosis = driver.find_element(By.XPATH, '//p[@class="desc_info"]').text
		except NoSuchElementException:
			synosis = '-'
		print(synosis)

		# A post is only published together with its poster, so a movie
		# without a usable poster URL is skipped before any further requests.
		filename = _poster_file_name(image)
		if filename is None:
			print(f"[{cd}] no usable poster URL ({image}); skipped")
			continue

		# Structured movie data comes from the KOBIS open API; the browser
		# shows the JSON response inside a <pre> element.
		delay()
		detailurl = 'http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json?key='+ str(KEY1) +'&movieCd='+ str(cd)
		driver.get(detailurl)
		try:
			movieDetail = driver.find_element(By.XPATH, '/html/body/pre').text
			movie_info = json.loads(movieDetail)['movieInfoResult']['movieInfo']
			movieNm = movie_info['movieNm']
			movieNmEn = movie_info['movieNmEn']
			prdtYear = movie_info['prdtYear']
			openDt = movie_info['openDt']
			typeNm = movie_info['typeNm']
			prdtStatNm = movie_info['prdtStatNm']
			nations_L = [nation['nationNm'] for nation in movie_info['nations']]
			genreNm_L = [genre['genreNm'] for genre in movie_info['genres']]
			directors_L = [person['peopleNm'] for person in movie_info['directors']]
			actors_L = [person['peopleNm'] for person in movie_info['actors']]
		except (NoSuchElementException, ValueError, KeyError, TypeError) as exc:
			# Covers a missing <pre>, invalid JSON, and responses that lack
			# the expected keys (for example an API error payload).
			print(f"[{cd}] unexpected KOBIS API response; skipped: {exc!r}")
			continue

		summary = _build_summary(synosis, [
			('영화코드', cd),
			('영화명(국문)', movieNm),
			('영화명(영문)', movieNmEn),
			('제작연도', prdtYear),
			('개봉일', openDt),
			('영화유형', typeNm),
			('제작상태', prdtStatNm),
			('제작국가<br>(전체)', ",".join(nations_L)),
			('영화장르<br>(전체)', ",".join(genreNm_L)),
			('영화감독', ",".join(directors_L)),
			('배우', ",".join(actors_L)),
		])

		# Tag order: genres, both titles, directors, then the leading actors.
		# NOTE(review): the cap admits 11 actors, matching the previous loop
		# (it broke only after appending index 10); 10 was probably intended.
		tags = (
			genreNm_L
			+ [movieNm, movieNmEn, '영화감독']
			+ directors_L
			+ ['출연배우']
			+ actors_L[:11]
		)
		# Drop empty names (e.g. a missing English title) so no blank tag is sent.
		tags = [tag for tag in tags if tag]

		# Fetch the poster and read the re-compressed copy for upload.
		download_image(image)
		try:
			with open(filename, 'rb') as img:
				# The XML-RPC library encodes the binary payload as base64.
				image_bits = xmlrpc_client.Binary(img.read())
		except OSError as exc:
			print(f"[{cd}] poster file {filename} is not available; skipped: {exc}")
			continue

		data = {
			'name': filename,
			'type': 'image/jpeg',  # mimetype
			'bits': image_bits,
		}

		try:
			response = client.call(media.UploadFile(data))
			attachment_id = response['id']

			post = WordPressPost()
			post.title = movieNm+'('+movieNmEn+')'
			post.slug = movieNm
			post.content = summary
			post.terms_names = {
				'category': genreNm_L,
				'post_tag': tags,
			}
			post.post_status = 'publish'
			post.thumbnail = attachment_id
			post.id = client.call(posts.NewPost(post))
		except (xmlrpc_client.Error, OSError, KeyError) as exc:
			# Report the failure and move on to the next movie instead of
			# silently dropping the post.
			print(f"[{cd}] WordPress upload/post failed: {exc!r}")
finally:
	# Always close the browser, including when a movie fails part-way through.
	driver.quit()

답글 남기기

이메일 주소는 공개되지 않습니다. 필수 필드는 *로 표시됩니다

© UiCore 2025. All Rights Reserved.