2025-01-05 14:57:33 +00:00
# I saved this script as a gist because I have rewritten it many times.
# It supports remembering line numbers and similar position data, which is not used here.
# It was originally written in C by me and later ported to Python.
# The original application did use these features.
# Written by retoor@molodetz.nl
# This script processes text files to identify words, their positions, and their frequency of occurrence. It uses command-line arguments to query or display popular words and stores results in a SQLite database.
# Imports:
# - argparse: For handling command-line arguments
# - sqlite3: A library to control and manage SQLite databases
# - pathlib: To work with filesystem paths in an object-oriented way
# MIT License:
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import argparse
import sqlite3
import pathlib
# Command-line interface. The parsed flags are consumed by the dispatch code
# at the bottom of the script; an empty --find value means "no lookup".
parser = argparse.ArgumentParser(
    description="Count word occurrences in text files and query them from SQLite.",
)
parser.add_argument(
    "--find",
    type=str,
    required=False,
    default="",
    help="print the stored count for this word",
)
parser.add_argument(
    "--index",
    action="store_true",
    help="reset the database and index the files in the logs_plain directory",
)
parser.add_argument(
    "--popular",
    action="store_true",
    help="print the 300 most frequent words followed by the total word count",
)
args = parser.parse_args()
def is_valid_char(c):
    """Return True if *c* can be part of a word.

    A word character is anything ``str.isalnum`` accepts, plus the underscore.
    Every other character (whitespace, punctuation, ...) terminates a word.
    """
    return c.isalnum() or c == "_"


def _record_word(words, chars, start, end, sentence, paragraph, current_paragraph):
    """Print the position data of one finished word and bump its count in *words*."""
    word_str = "".join(chars)
    # The length is taken from the characters themselves; deriving it from
    # ``end - start`` is one short because both positions are inclusive.
    print(f"{word_str} {start} {end} {len(chars)} {sentence} {paragraph} {current_paragraph}")
    words[word_str] = words.get(word_str, 0) + 1


def process_file(file_path):
    """Split a text file into words and count how often each one occurs.

    A word is a maximal run of characters accepted by ``is_valid_char``.
    For every word one line is printed containing: the word, its 1-based
    start and end character positions (inclusive), its length, the value of
    the ``line`` counter and of the ``alinia`` counter when the word started,
    and the current ``alinia`` counter.

    ``line`` is incremented on every ``.`` (so it effectively counts
    sentences). ``alinia`` (presumably "paragraph") is incremented when a word
    is preceded by two or more newlines since the previous word.

    Args:
        file_path: Path of the file to read (``str`` or path-like).

    Returns:
        A dict mapping each word to its number of occurrences in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    word = []
    word_start = -1  # -1 means "not inside a word"
    word_line = 0
    word_alinia = 0
    new_line_count = 0
    pos = 0
    line = 1
    alinia = 1
    words = {}
    # Undecodable bytes become U+FFFD, which is not a word character, so a
    # single bad byte acts as a delimiter instead of aborting the whole file.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        # Iterating by line keeps the file streamed while avoiding one
        # read() call per character; the character sequence is the same.
        for text_line in f:
            for c in text_line:
                pos += 1
                if c == ".":
                    line += 1
                elif c == "\n":
                    new_line_count += 1
                if not is_valid_char(c):
                    if word_start > -1:
                        _record_word(
                            words, word, word_start, pos - 1,
                            word_line, word_alinia, alinia,
                        )
                    word_start = -1
                    word = []
                    continue
                if new_line_count >= 2:
                    alinia += 1
                # Only newlines seen since the previous word count towards a
                # paragraph break, so the counter restarts at every word char.
                new_line_count = 0
                word.append(c)
                if word_start == -1:
                    word_start = pos
                    word_line = line
                    word_alinia = alinia
    # A file that ends in the middle of a word has no trailing delimiter to
    # trigger the flush above, so the pending word is recorded here.
    if word_start > -1:
        _record_word(words, word, word_start, pos, word_line, word_alinia, alinia)
    return words
class WordDb:
    """Word-frequency store backed by one SQLite table named ``words``.

    The table has two columns: ``word`` (text, unique) and ``count``
    (integer). The database is the source of truth for counts; ``self.words``
    only mirrors what was inserted through this instance since it was created
    or last ``reset()``.

    Instances can be used as context managers; leaving the ``with`` block
    commits and closes the connection.
    """

    # IF NOT EXISTS lets the same statement serve both __init__ and reset().
    _CREATE_TABLE_SQL = """
        CREATE TABLE IF NOT EXISTS words (
            word TEXT NOT NULL PRIMARY KEY,
            count INTEGER NOT NULL
        )
    """

    def __init__(self, path):
        """Open (or create) the SQLite database at *path*."""
        self.path = path
        self.conn = sqlite3.connect(path)
        self.cursor = self.conn.cursor()
        # Create the table up front so queries on a brand-new database return
        # empty results instead of failing with "no such table".
        self.cursor.execute(self._CREATE_TABLE_SQL)
        self.conn.commit()
        self.words = {}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def reset(self):
        """Drop all stored counts and recreate an empty ``words`` table."""
        self.words = {}
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self.cursor.execute(self._CREATE_TABLE_SQL)
        self.conn.commit()

    def insert(self, word, count):
        """Add *count* occurrences of *word*; the caller must ``commit()``.

        The existing row is incremented when there is one, so counts also
        accumulate correctly for words stored by an earlier process.
        """
        self.cursor.execute(
            "UPDATE words SET count = count + ? WHERE word = ?", (count, word)
        )
        if self.cursor.rowcount == 0:
            self.cursor.execute(
                "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
            )
        self.words[word] = self.words.get(word, 0) + count

    def commit(self):
        """Commit pending inserts to the database file."""
        self.conn.commit()

    def total_count(self):
        """Return the sum of all counts, or 0 when the table is empty."""
        # SUM() over zero rows yields NULL; COALESCE turns that into 0.
        self.cursor.execute("SELECT COALESCE(SUM(count), 0) FROM words")
        return self.cursor.fetchone()[0]

    def get(self, word):
        """Return the stored count for *word*, or 0 if it was never seen."""
        # Aggregating always produces exactly one row, so an unknown word
        # gives 0 instead of a None row that cannot be indexed.
        self.cursor.execute(
            "SELECT COALESCE(SUM(count), 0) FROM words WHERE word = ?", (word,)
        )
        return self.cursor.fetchone()[0]

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` tuples, most frequent first.

        Words with equal counts are ordered alphabetically so the result is
        deterministic.
        """
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC, word ASC LIMIT ?",
            (count,),
        )
        return list(self.cursor.fetchall())

    def close(self):
        """Commit and close the connection; calling it again is a no-op."""
        # getattr: __del__ may run on an instance whose __init__ failed
        # before ``conn`` was assigned.
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        self.conn = None
        conn.commit()
        conn.close()
        print("Database closed")

    def __del__(self):
        self.close()
# Shared database handle used by index() and by the command-line actions at
# the bottom of the script. The file is created in the current directory.
db = WordDb("tags.db")
def index():
    """Index every file in the ``logs_plain`` directory into the global ``db``.

    Each regular file is tokenized with ``process_file`` and its word counts
    are added to ``db``. Afterwards the 100 most popular words are
    pretty-printed, the inserts are committed, and the number of distinct
    words seen during this run is printed.

    Raises:
        OSError: If ``logs_plain`` does not exist or a file cannot be read.
    """
    from pprint import pprint

    seen = set()
    for path in pathlib.Path("logs_plain").iterdir():
        if not path.is_file():
            # iterdir() also yields sub-directories, which cannot be opened
            # as text files.
            continue
        for word, count in process_file(path).items():
            db.insert(word, count)
            seen.add(word)
    pprint(db.most_popular(100))
    db.commit()
    print(len(seen))
# Run the actions selected on the command line. Re-indexing goes first so
# that --find / --popular given together with --index report on the freshly
# built table rather than on the data that is about to be dropped.
if args.index:
    db.reset()
    index()
if args.find:
    print(db.get(args.find))
if args.popular:
    for item in db.most_popular(300):
        print(item)
    print(db.total_count())