"""Bag-of-words plagiarism check based on cosine similarity of stemmed terms."""

import math
from collections import Counter
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

nltk.download('punkt')
# Newer NLTK releases load the word tokenizer's data from 'punkt_tab' rather
# than 'punkt'; fetch both so word_tokenize works regardless of the version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# One stemmer instance is shared by every call instead of building one per text.
_STEMMER = PorterStemmer()


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English stop-word list as a frozenset, loaded on first use."""
    return frozenset(stopwords.words('english'))


def _cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two term-frequency Counters.

    Returns 0.0 when either vector is empty, so the result is always defined.
    """
    shared_terms = vector1.keys() & vector2.keys()
    numerator = sum(vector1[term] * vector2[term] for term in shared_terms)

    sum1 = sum(count ** 2 for count in vector1.values())
    sum2 = sum(count ** 2 for count in vector2.values())
    # Take a single square root of the exact integer product rather than
    # multiplying two rounded roots; identical vectors then score exactly 1.0.
    denominator = math.sqrt(sum1 * sum2)

    if not denominator:
        return 0.0
    return numerator / denominator


def preprocess_text(text):
    """Tokenize, filter and stem ``text``.

    The text is lower-cased and split with ``word_tokenize``. English stop
    words are removed, as are tokens that contain no letter or digit (the
    tokenizer emits punctuation marks as separate tokens, and counting them
    as terms would inflate the similarity of unrelated texts). The remaining
    tokens are reduced to their Porter stems.

    Args:
        text: The string to preprocess.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stop_words = _english_stop_words()
    return [
        _STEMMER.stem(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]


def calculate_similarity(text1, text2):
    """Return the cosine similarity between two texts.

    Both texts are passed through ``preprocess_text`` and compared as
    term-frequency vectors.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float; 0.0 when either text has no terms left after preprocessing.
    """
    vector1 = Counter(preprocess_text(text1))
    vector2 = Counter(preprocess_text(text2))
    return _cosine_similarity(vector1, vector2)


def check_plagiarism(blog_post, existing_sources):
    """Score ``blog_post`` against every entry of ``existing_sources``.

    Args:
        blog_post: The text to check.
        existing_sources: Mapping of source name to source text.

    Returns:
        A dict mapping each source name to its similarity score with
        ``blog_post``, as computed by cosine similarity of stemmed terms.
    """
    if not existing_sources:
        return {}

    # The blog post is the same for every comparison: preprocess it once
    # instead of re-tokenizing and re-stemming it for each source.
    post_vector = Counter(preprocess_text(blog_post))
    return {
        source_name: _cosine_similarity(
            post_vector, Counter(preprocess_text(source_content))
        )
        for source_name, source_content in existing_sources.items()
    }


def main():
    """Run the example: score a placeholder post against placeholder sources."""
    blog_post = """
    Your blog post content goes here.
    This is just a placeholder text.
    """

    existing_sources = {
        "Source 1": "Existing source 1 content",
        "Source 2": "Existing source 2 content",
        # Add more existing sources as needed
    }

    similarity_scores = check_plagiarism(blog_post, existing_sources)

    print("Plagiarism Check Results:")
    for source_name, similarity_score in similarity_scores.items():
        print(f"{source_name}: {similarity_score}")


# Only run the example when executed as a script, not when imported.
if __name__ == "__main__":
    main()