JSONファイルを元に購入リストの商品があるか判定

 vim ocr_list.py

でファイルを作成

import json
# Load the runtime settings. The context manager closes the file handle as
# soon as the JSON has been parsed instead of leaving it open for the whole run.
with open('settings.json', 'r', encoding='utf_8') as settings_json:
    settings = json.load(settings_json)

# Fetch the list of flyer PDF links from the official site
def get_urls():
    """Scrape the flyer page and return one entry per PDF link.

    Reads ``url``, ``url_params_name``, ``url_params_value`` and ``url_stem``
    from the module-level ``settings`` dict.

    Returns:
        A list of ``{'date': ..., 'url': ...}`` dicts in page order. For each
        flyer the front-side PDF comes first, followed by the back-side PDF
        when the page provides one.
    """
    import requests
    from bs4 import BeautifulSoup

    params = {settings['url_params_name']: settings['url_params_value']}
    html = requests.get(settings['url'], params=params)
    soup = BeautifulSoup(html.text, 'html.parser')

    url_list = []
    for flyer in soup.find_all('table'):
        sale = flyer.find('div', {'class': 'sale'})
        date_link = sale.find('a') if sale is not None else None
        omote_link = flyer.find('a', {'title': 'PDFオモテ'})
        if date_link is None or omote_link is None:
            # Every <table> on the page is visited; skip the ones that lack
            # the date link or the front-side PDF link instead of crashing
            # on a None result.
            continue

        # Date label: drop spaces and normalise full-width parentheses
        date = (
            date_link.get_text(strip=True)
            .replace(' ', '')
            .replace('（', '(')
            .replace('）', ')')
        )

        # PDF (front side)
        url_list.append({
            'date': date,
            'url': settings['url_stem'] + omote_link['href'].replace('../', ''),
        })

        # PDF (back side) - optional, looked up only once
        ura_link = flyer.find('a', {'title': 'PDFウラ'})
        if ura_link is not None:
            url_list.append({
                'date': date,
                'url': settings['url_stem'] + ura_link['href'].replace('../', ''),
            })

    return url_list

# Pick out the flyer URLs that have not been analysed yet
def get_new_urls(url_list):
    """Return the entries of *url_list* whose URL is not recorded in urls.txt.

    Each returned entry gets a ``number`` key (0, 1, 2, ... in input order);
    the dicts are the same objects as in *url_list* and are modified in place.
    urls.txt is then overwritten with every URL of *url_list*, one per line.

    Args:
        url_list: list of dicts, each with at least a ``'url'`` key.

    Returns:
        The list of entries that were not present in urls.txt.
    """
    # Read urls.txt; a missing file (first run) simply means nothing has
    # been seen yet.
    try:
        with open('urls.txt', 'r') as f:
            old_urls = set(f.read().splitlines())
    except FileNotFoundError:
        old_urls = set()

    new_urls = []
    for url_info in url_list:
        if url_info['url'] not in old_urls:
            # New flyer: number it by its position among the new ones
            url_info['number'] = len(new_urls)
            new_urls.append(url_info)

    # Write urls.txt
    with open('urls.txt', 'w') as f:
        f.writelines(url_info['url'] + '\n' for url_info in url_list)

    return new_urls

# Download the PDFs that have not been analysed yet
def dl_pdfs(new_url_list):
    """Download every entry of *new_url_list* to ``pdf/<number>.pdf``.

    Each entry must carry ``'url'`` and ``'number'``. The local path is
    stored on the entry under ``'pdf_path'``, and the function waits two
    seconds after each download.

    Returns:
        A new list holding the same (now updated) entry dicts.
    """
    from time import sleep
    from urllib.request import urlretrieve

    downloaded = []
    for entry in new_url_list:
        destination = 'pdf/{}.pdf'.format(entry["number"])
        urlretrieve(entry['url'], destination)
        entry['pdf_path'] = destination

        # Pause between requests
        sleep(2)

        downloaded.append(entry)

    return downloaded

# Convert a PDF to JPEG images
def pdf_to_jpeg(path):
    """Render each page of the PDF at *path* to a JPEG file under ``./jpg``.

    Files are named ``<pdf stem>_<two-digit page number>.jpeg``.

    Args:
        path: path of the PDF file to convert.

    Returns:
        A list of ``pathlib.Path`` objects, one per saved page, in page order.
    """
    import os
    from pathlib import Path
    from pdf2image import convert_from_path

    # Make the bundled poppler/bin visible through PATH. Add it only once:
    # this function runs once per PDF, and appending unconditionally would
    # make PATH grow on every call.
    poppler_dir = str(Path(__file__).parent.absolute() / 'lib/poppler/bin')
    current_path = os.environ.get('PATH', '')
    if poppler_dir not in current_path.split(os.pathsep):
        if current_path:
            os.environ['PATH'] = current_path + os.pathsep + poppler_dir
        else:
            os.environ['PATH'] = poppler_dir

    image_paths = []

    pdf_path = Path(path)
    # PDF -> images (150 dpi)
    pages = convert_from_path(str(pdf_path), 150)

    # Save the images one page at a time
    image_dir = Path('./jpg')
    for i, page in enumerate(pages):
        file_name = pdf_path.stem + '_{:02d}'.format(i + 1) + '.jpeg'
        image_path = image_dir / file_name
        # Save as JPEG
        page.save(str(image_path), 'JPEG')
        image_paths.append(image_path)

    return image_paths

# Convert several flyers to JPEG
def pdfs_to_jpeg(pdf_list):
    """Convert the PDF of every entry in *pdf_list* to JPEG images.

    Each entry must carry ``'pdf_path'``. The paths returned by
    ``pdf_to_jpeg`` are stored on the entry under ``'image_paths'``.

    Returns:
        A new list holding the same (now updated) entry dicts.
    """
    converted = []
    for entry in pdf_list:
        entry['image_paths'] = pdf_to_jpeg(entry['pdf_path'])
        converted.append(entry)

    return converted

# OCR
def detect_text(image_paths):
    """Run Cloud Vision ``text_detection`` on each image and return the text.

    The descriptions of all text annotations of all images are concatenated
    without separators.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        The concatenated text as a single string ('' when no image is given).

    Raises:
        Exception: when the API response of an image carries an error message.
    """
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()

    pieces = []

    for image_path in image_paths:
        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        image = vision.Image(content=content)

        # pylint: disable=no-member
        response = client.text_detection(image=image)

        # Check for an API error before consuming the annotations, so a
        # failed request is never treated as a result.
        if response.error.message:
            raise Exception(
                '{}\nFor more info on error messages, check: '
                'https://cloud.google.com/apis/design/errors'.format(
                    response.error.message))

        # NOTE(review): per the Vision API docs the first annotation appears
        # to hold the whole text and the rest the individual words, so the
        # text is effectively included twice - kept as-is; confirm if needed.
        pieces.extend(str(text.description) for text in response.text_annotations)

    return ''.join(pieces)

# Keyword search
def search_words(all_text):
    """Return the keywords from ``settings["keywords"]`` found in *all_text*.

    Matching is a plain substring test; the result keeps the order of the
    settings list.
    """
    return [word for word in settings["keywords"] if word in all_text]

# Collect the flyers that matched at least one keyword
def get_target_flyers(jpg_list):
    """Run OCR on every flyer and keep the ones that contain a keyword.

    Each entry must carry ``'image_paths'``. Matching entries get their
    matched keywords stored under ``'hitwords'``.

    Returns:
        A list of the matching entry dicts, in input order.
    """
    matched = []
    for flyer in jpg_list:
        found = search_words(detect_text(flyer['image_paths']))
        if not found:
            continue

        flyer['hitwords'] = found
        matched.append(flyer)

    return matched

# Slack notification
def slack_notice(results):
    """Post one Slack message per matched flyer.

    Uses the webhook URL in ``settings['slack_webhook_url']``. Each entry of
    *results* must carry ``'date'``, ``'hitwords'`` and ``'url'``.
    """
    import slackweb
    notifier = slackweb.Slack(url=settings['slack_webhook_url'])
    for flyer in results:
        message = '{} チラシ掲載商品：{}\n<{}|チラシを見る>'.format(
            flyer["date"], ",".join(flyer["hitwords"]), flyer["url"])
        notifier.notify(text=message)

### FlyerOCR ###
# Driver: scrape the flyer links, download the new PDFs, convert them to
# JPEG, OCR them, and notify Slack about the flyers that match a keyword.
import shutil
import os
os.makedirs('pdf/', exist_ok=True)
os.makedirs('jpg/', exist_ok=True)

try:
    url_list = get_urls()
    new_url_list = get_new_urls(url_list)
    pdf_list = dl_pdfs(new_url_list)
    jpg_list = pdfs_to_jpeg(pdf_list)
    results = get_target_flyers(jpg_list)
    slack_notice(results)
finally:
    # Always remove the working directories, even when a step above fails,
    # so a failed run does not leave stale PDFs/JPEGs behind.
    shutil.rmtree('pdf/')
    shutil.rmtree('jpg/')

のコードを書き換える

text_detection
は
画像内のテキスト要素を検出するのに適しており、一般的なOCRタスクに使用される

これを
document_text_detection
に変更する
こちらは文書のスキャンや複雑なレイアウトを持つ画像に適しており、より詳細なテキスト情報を取得できる

# OCR
def detect_text(image_paths):
    """Run Cloud Vision ``document_text_detection`` on each image.

    The ``full_text_annotation.text`` of every image is concatenated in
    input order, without separators.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        The concatenated text as a single string ('' when no image is given).

    Raises:
        Exception: when the API response of an image carries an error message.
    """
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()

    pieces = []

    for image_path in image_paths:
        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        image = vision.Image(content=content)

        # Use document_text_detection
        response = client.document_text_detection(image=image)

        # Check for an API error before reading the annotation, so a failed
        # request is never treated as a result.
        if response.error.message:
            raise Exception(
                '{}\nFor more info on error messages, check: '
                'https://cloud.google.com/apis/design/errors'.format(
                    response.error.message))

        # full_text_annotation holds the text of the whole document
        pieces.append(response.full_text_annotation.text)

    return ''.join(pieces)

とりあえず実行できるか試すので

# Example run: OCR one sample image and print the extracted text
if __name__ == "__main__":
    image_paths = ["images/combined_image_20240802.jpg"]
    print(detect_text(image_paths))

この結果はそのままだとターミナル表示なので
テキストファイルに保存する

全文の中で検出成功しているのは
麻婆豆腐
キッチンタオル

全文は

8/
2
金曜日
本日限定!
とろける
バラエティパック
1000
T
スライスカード
サイズ
創品
創業祭特に
$10
ポイント
焼そば
うどん
$10
ポイント
129 168 159
プレーンヨーグル
ビヒダス
400
イチビキ
EE97
128
上級
ヒラス
-10
139円
創業祭特価
CRUNKY
Ghana
ON 161
198
213
30
128 98 9869
136
105円
645
8/2はおやつの日
BLACKENT
ADREN
おやつフェア カメ
強力小麦を
POTAH
198
64
191
78
B
カメリヤス
強力小麦粉」
298
321
ロリエ
きれいガード
ソフィ
$20
198
PARA 2139
税込74円]
かに
麻婆豆腐
168
8x4 パウダースプレー
88
181
ジョイコン
2
100-498
100 100
260 root
1,180
OPIE
ボディフィット
ガード
8
50
270
378
171
CARE
UF.O.
3
UFO
パワフル吸収
キッチンタオル
くらしりえね
ミネピア
「キッチンタオル
$149
10% 163
18 U
3759
8
300
$20
248
10272
$100
8398
ロール
ダブルメロン
38 萬
14952371
698
798
580
530
2.836
58
458
10 2.066
オフェルミン
2080
=.098
767
7=1080]
10 305P
TONA 415

となっているが
画像と重なっている文字は読み取りが苦手みたい
単純に解像度の問題かもしれない

とりあえず
麻婆豆腐
キッチンタオル
はできたので
これをjsonファイルに書き込んでリストと一致するか実験する

settings.jsonの中身を

{
  "keywords": [  
    "麻婆豆腐",
    "キッチンタオル",
    "keyword3"
  ]
}

にする

次にキーワードと一致したもののみ変数に格納する
これをLINEで送るようにする

とりあえずコードを変更

import json
# Load the settings; the context manager closes the file handle once the
# JSON has been parsed instead of leaving it open.
with open('settings.json', 'r', encoding='utf_8') as settings_json:
    settings = json.load(settings_json)

# OCR
def detect_text(image_paths):
    """Extract text from the given images with ``document_text_detection``.

    The ``full_text_annotation.text`` of every image is concatenated in
    input order, without separators.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        The concatenated text as a single string ('' when no image is given).

    Raises:
        Exception: when the API response of an image carries an error message.
    """
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()

    extracted = []

    for image_path in image_paths:
        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        image = vision.Image(content=content)

        # Use document_text_detection
        response = client.document_text_detection(image=image)

        # Fail on an API error first; only a successful response is read.
        if response.error.message:
            raise Exception(
                '{}\nFor more info on error messages, check: '
                'https://cloud.google.com/apis/design/errors'.format(
                    response.error.message))

        # Take the whole-document text from the FullTextAnnotation
        extracted.append(response.full_text_annotation.text)

    return ''.join(extracted)

# Keyword search
def search_words(all_text):
    """List the keywords of ``settings["keywords"]`` that occur in *all_text*.

    A keyword counts as a hit when it is a substring of *all_text*; hits are
    returned in the order of the settings list.
    """
    return [candidate for candidate in settings["keywords"] if candidate in all_text]

# Collect the flyers that matched at least one keyword
def get_target_flyers(jpg_list):
    """OCR each flyer's images and return the flyers containing a keyword.

    Each entry must carry ``'image_paths'``. A matching entry is updated in
    place with a ``'hitwords'`` list and included in the result.

    Returns:
        A list of the matching entry dicts, in input order.
    """
    hits = []
    for info in jpg_list:
        words = search_words(detect_text(info['image_paths']))
        if words:
            info['hitwords'] = words
            hits.append(info)

    return hits


# Example run: print the OCR text of one sample image
if __name__ == "__main__":
    image_paths = ["images/combined_image_20240802.jpg"]
    print(detect_text(image_paths))

を

import json
from google.cloud import vision
import io

# Load the settings file; the context manager closes the handle once the
# JSON has been parsed instead of leaving it open.
with open('settings.json', 'r', encoding='utf_8') as settings_json:
    settings = json.load(settings_json)

# Extract text from images with OCR
def detect_text(image_paths):
    """Extract text from the given images with ``document_text_detection``.

    Relies on the module-level ``vision`` import. The
    ``full_text_annotation.text`` of every image is concatenated in input
    order, without separators.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        The concatenated text as a single string ('' when no image is given).

    Raises:
        Exception: when the API response of an image carries an error message.
    """
    client = vision.ImageAnnotatorClient()

    extracted = []

    for image_path in image_paths:
        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        image = vision.Image(content=content)

        # Use document_text_detection to get the text of the whole document
        response = client.document_text_detection(image=image)

        # Fail on an API error first; only a successful response is read.
        if response.error.message:
            raise Exception(
                '{}\nFor more info on error messages, check: '
                'https://cloud.google.com/apis/design/errors'.format(
                    response.error.message))

        # Collect the extracted text
        extracted.append(response.full_text_annotation.text)

    return ''.join(extracted)

# Keyword search
def search_words(all_text):
    """Return every keyword of ``settings["keywords"]`` contained in *all_text*.

    Uses a substring test per keyword and preserves the settings order.
    """
    return [kw for kw in settings["keywords"] if kw in all_text]

# Example run: OCR one sample image and report the matched keywords
if __name__ == "__main__":
    image_paths = ["images/combined_image_20240802.jpg"]
    hitwords = search_words(detect_text(image_paths))

    # Show only the keywords that were hit
    if not hitwords:
        print("マッチしたキーワードはありませんでした。")
    else:
        print("マッチしたキーワード:", ", ".join(hitwords))

に変えてみる

これで実行すると
マッチしたキーワード: 麻婆豆腐, キッチンタオル
となる

あとはキーワードにマッチした画像も一緒にLINEで送信したいので
ファイルパスを取得するようにする

そもそもの流れを復習すると
Gmailで最新のチラシのリンクを開く

日替のチラシがあるなら画像をダウンロードし統合して１つのファイルにする
clik_allget_image.py

OCRしてリストに一致しているものを取り出す
LINEで送信
line_notify.py

となっている

ただshufoo限定で店舗ごとにユニークアドレスとなっているのなら
Gmailから開く処理は不要となる

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル