import math
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# Request the 'punkt' and 'stopwords' NLTK data packages. These calls sit at
# module top level, so they run every time this file is executed or imported.
# NOTE(review): presumably these back word_tokenize() and
# stopwords.words('english') used below — confirm against the installed NLTK
# version's resource names.
nltk.download('punkt')
nltk.download('stopwords')
def preprocess_text(text):
    """Turn raw text into a list of lowercase, stop-word-free word stems.

    The text is lowercased and tokenized with ``word_tokenize``; English stop
    words and tokens containing no letter or digit are discarded, and the
    remaining tokens are reduced to their Porter stems.

    Args:
        text: The string to normalise.

    Returns:
        A list of stems in the order their tokens occur in ``text``.
        Duplicates are kept so callers can count term frequencies.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        # Punctuation-only tokens (".", ",", "!") are not terms; keeping them
        # would make unrelated texts look similar just for sharing
        # punctuation.
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]
def calculate_similarity(text1, text2):
    """Return the cosine similarity of two texts' term-frequency vectors.

    Both texts are passed through ``preprocess_text`` and counted with
    ``Counter``; the score is the dot product of the two count vectors
    divided by the product of their Euclidean norms.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The similarity as a float, or ``0.0`` when either text yields no
        terms after preprocessing (the norm product would be zero).
    """
    vector1 = Counter(preprocess_text(text1))
    vector2 = Counter(preprocess_text(text2))

    # Only terms present in both texts contribute to the dot product.
    shared_terms = vector1.keys() & vector2.keys()
    numerator = sum(vector1[term] * vector2[term] for term in shared_terms)

    sum1 = sum(count ** 2 for count in vector1.values())
    sum2 = sum(count ** 2 for count in vector2.values())
    if not sum1 or not sum2:
        return 0.0

    # Take a single root of the exact integer product rather than multiplying
    # two separately rounded roots; otherwise texts with identical term counts
    # can score 0.9999999999999998 instead of 1.0.
    return numerator / math.sqrt(sum1 * sum2)
def check_plagiarism(blog_post, existing_sources):
    """Score ``blog_post`` against every entry of ``existing_sources``.

    Args:
        blog_post: The text being checked.
        existing_sources: Mapping of source name to that source's text.

    Returns:
        A dict with the same keys as ``existing_sources``, each mapped to the
        value ``calculate_similarity`` returns for that source's text.
    """
    return {
        name: calculate_similarity(blog_post, content)
        for name, content in existing_sources.items()
    }
# Example usage. Guarded so the demo runs only when this file is executed as
# a script, not when another module imports the functions above.
if __name__ == "__main__":
    blog_post = """
Your blog post content goes here.
This is just a placeholder text.
"""
    existing_sources = {
        "Source 1": "Existing source 1 content",
        "Source 2": "Existing source 2 content",
        # Add more existing sources as needed
    }
    similarity_scores = check_plagiarism(blog_post, existing_sources)
    # Print one "<source name>: <score>" line per compared source.
    print("Plagiarism Check Results:")
    for source_name, similarity_score in similarity_scores.items():
        print(f"{source_name}: {similarity_score}")
# 0 Comments