使用说明
chmod 755 build-index.cgi simple-search.cgi
cpan HTML::Parser JSON
/var/www/cgi-bin/Artocarpus/build-index.cgi
0 2 * * * /var/www/cgi-bin/Artocarpus/build-index.cgi > /dev/null 2>&1
这种方法最适合小型网站(少于 100 个页面):实现简单,除上面安装的 Perl 模块外无需其他依赖,搜索速度也很快。索引是一个简单的 JSON 文件,易于调试和理解。
合并为单个 CGI 脚本的版本(索引构建与搜索二合一;脚本内的表单和链接假定文件名为 all-in-one-search.cgi):
#!/usr/bin/perl
use strict;
use warnings;
use CGI qw/:standard/;
use CGI::Carp qw(fatalsToBrowser);
use HTML::Entities;
use File::Find;
use JSON;
use Encode qw(decode encode);
use utf8;
# --- Configuration -----------------------------------------------------
my $site_root      = "/var/www/html/Artocarpus";                       # directory tree that gets indexed
my $index_file     = "/var/www/cgi-bin/Artocarpus/search_index.json";  # where the JSON index is stored
my @file_ext       = qw(.html .htm .php .shtml);                       # file name suffixes to index
my $max_results    = 10;                                               # maximum number of results displayed
my $context_length = 40;                                               # characters of context kept on each side of a hit
my $title          = "简易网站搜索";                                    # page title / main heading
my $encoding       = "utf8";                                           # encoding for file I/O, the query and the page

# --- Request setup -----------------------------------------------------
my $cgi    = CGI->new();
my $action = $cgi->param('action') || 'search';

# Use a defined-ness test rather than "|| ''" so that a literal "0" typed
# into the search box is not silently replaced by an empty string.
my $query = $cgi->param('q');
$query = '' unless defined $query;
$query = decode($encoding, $query);

# Everything printed from here on is a character string ("use utf8" literals
# and decoded input), so STDOUT needs an encoding layer. Without one Perl
# warns about wide characters and writes strings that only contain
# Latin-1-range characters as single bytes, which is not valid UTF-8.
binmode(STDOUT, ":encoding($encoding)");

# Emit the HTTP header, declaring the same charset as the page body.
print $cgi->header(-charset => $encoding);
# Emit the top of the page: document head, inline stylesheet, main heading,
# the search form and the "rebuild index" link.
# The heredoc is interpolated: $encoding feeds the <meta charset>, $title is
# used for both <title> and <h1>, and the current query is echoed back into
# the text input after being passed through encode_entities().
# The form submits via GET to all-in-one-search.cgi with a hidden
# action=search field; the rebuild link requests ?action=build_index.
# Everything between the heredoc markers is sent to the browser verbatim,
# so no Perl comments can be placed inside it.
print <<HTML;
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="$encoding">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>$title</title>
<style>
body {
font-family: Arial, sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
max-width: 1000px;
margin: 0 auto;
}
h1, h2 {
color: #333;
}
.search-form {
margin: 20px 0;
padding: 15px;
background-color: #f5f5f5;
border-radius: 5px;
}
.search-input {
padding: 8px;
width: 70%;
font-size: 16px;
border: 1px solid #ddd;
border-radius: 4px;
}
.search-button {
padding: 8px 15px;
font-size: 16px;
background-color: #4285f4;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
}
.admin-button {
padding: 8px 15px;
font-size: 16px;
background-color: #f44336;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
text-decoration: none;
display: inline-block;
margin-top: 15px;
}
.result {
margin-bottom: 20px;
padding: 10px;
border-bottom: 1px solid #eee;
}
.result-title {
font-size: 18px;
color: #1a0dab;
margin-bottom: 5px;
}
.result-url {
color: #006621;
font-size: 14px;
margin-bottom: 5px;
}
.result-snippet {
font-size: 14px;
}
.highlight {
background-color: #FFFF99;
font-weight: bold;
}
.no-results {
color: #666;
font-style: italic;
}
.message-box {
padding: 15px;
margin: 20px 0;
border-radius: 5px;
}
.success {
background-color: #d4edda;
color: #155724;
}
.error {
background-color: #f8d7da;
color: #721c24;
}
.info {
background-color: #d1ecf1;
color: #0c5460;
}
</style>
</head>
<body>
<h1>$title</h1>
<div class="search-form">
<form method="GET" action="all-in-one-search.cgi">
<input type="hidden" name="action" value="search">
<input type="text" name="q" value="@{[encode_entities($query)]}" class="search-input" placeholder="请输入搜索词..." required>
<input type="submit" value="搜索" class="search-button">
</form>
<a href="all-in-one-search.cgi?action=build_index" class="admin-button">重建索引</a>
</div>
HTML
# Handle an index rebuild request (action=build_index): walk $site_root,
# extract the title and plain text of every matching file, and store the
# resulting records as JSON in $index_file. Progress is printed to the page
# as the crawl proceeds.
if ($action eq 'build_index') {
    print "<div class='message-box info'>";
    print "<h2>开始构建索引...</h2>";
    print "<p>正在扫描目录:" . encode_entities($site_root, q{<>&"'}) . "</p>";

    # One hash per indexed page; filled in by process_file().
    my @pages = ();

    # File::Find callback that indexes a single file.
    #
    # Reads $_ (the entry name) and $File::Find::name (its full path), as
    # set by find(). Appends a record with the keys url, title, content,
    # summary and last_modified to @pages and prints a progress line.
    # Returns nothing; entries that are not regular files, do not carry a
    # configured extension, or cannot be opened are skipped.
    #
    # NOTE: this is a named sub nested inside the if block and it refers to
    # the lexical @pages declared just above. Named subs bind such a
    # variable once, when they are compiled, so this layout relies on the
    # block running at most once per process (as it does under plain CGI).
    sub process_file {
        my $file = $_;
        my $path = $File::Find::name;

        # Only regular files whose name ends in a configured extension.
        # \Q...\E keeps the "." of ".html" literal instead of "any char".
        return unless -f $file;
        return unless grep { $file =~ /\Q$_\E\z/ } @file_ext;

        # Turn the file system path into a site-relative URL. The root is
        # quoted so that path characters are never read as regex syntax.
        my $url = $path;
        $url =~ s/^\Q$site_root\E//;
        $url = "/$url" unless $url =~ m{^/};

        # Slurp the file; an unreadable file is logged and skipped so that
        # one bad file does not abort the whole crawl.
        open my $fh, "<:encoding($encoding)", $file or do {
            warn "Cannot read $path: $!\n";
            return;
        };
        my $content = do { local $/; <$fh> };
        close $fh;
        $content = '' unless defined $content;

        # Title: contents of <title> (attributes tolerated), entity-decoded
        # and whitespace-normalised. Fall back to the file name when the
        # element is missing or empty.
        my $title = $file;
        if ($content =~ m{<title\b[^>]*>(.*?)</title>}si) {
            my $found = $1;
            $found = decode_entities($found);
            $found =~ s/\s+/ /g;
            $found =~ s/^ | $//g;
            $title = $found if length $found;
        }

        # Plain text: drop scripts and styles, strip the remaining tags,
        # collapse whitespace, then decode entities. Decoding comes last so
        # that an escaped "&lt;" is not mistaken for the start of a tag.
        my $text_content = $content;
        $text_content =~ s/<script.*?>.*?<\/script>//gis;
        $text_content =~ s/<style.*?>.*?<\/style>//gis;
        $text_content =~ s/<[^>]*>//gs;
        $text_content =~ s/\s+/ /gs;
        $text_content = decode_entities($text_content);

        # The first 500 characters double as a summary.
        my $summary = substr($text_content, 0, 500);

        # Fetch the mtime into a scalar first: inside the hash constructor
        # a failed stat() would yield an empty list and shift the pairs.
        my $mtime = (stat($file))[9];
        $mtime = 0 unless defined $mtime;

        push @pages, {
            url           => $url,
            title         => $title,
            content       => $text_content,
            summary       => $summary,
            last_modified => $mtime,
        };

        print "<p>已索引: " . encode_entities($url, q{<>&"'}) . "</p>";
        return;
    }

    # Walk the whole site tree.
    find(\&process_file, $site_root);

    # Most recently modified pages first.
    @pages = sort { $b->{last_modified} <=> $a->{last_modified} } @pages;

    # Write the index. encode_json() already returns UTF-8 bytes and the
    # :encoding layer encodes them once more; the search branch reads the
    # file back through the same layer before calling decode_json(), so the
    # round trip is consistent. Keep both sides in step if this changes.
    my $written = eval {
        open my $out, ">:encoding($encoding)", $index_file
            or die "无法创建索引文件: $!";
        print {$out} encode_json(\@pages);
        # Buffered write errors only surface at close time.
        close $out or die "无法写入索引文件: $!";
        1;
    };

    if (!$written) {
        my $error = encode_entities("$@", q{<>&"'});
        print "<p class='error'>索引创建失败: $error</p>";
    } else {
        print "<p>索引完成,共索引了 " . scalar(@pages) . " 个页面。</p>";
        print "<p>索引文件保存在: " . encode_entities($index_file, q{<>&"'}) . "</p>";
    }

    print "</div>";
    print "<p><a href='all-in-one-search.cgi' class='admin-button'>返回搜索</a></p>";
}
# Handle a search request (the default action): load the JSON index, find
# every page whose title or text contains the query (case-insensitive,
# literal match) and print up to $max_results hits with highlighted
# snippets. Nothing is printed below the form when the query is empty.
elsif ($action eq 'search') {
    # Ignore surrounding whitespace; test the length rather than the truth
    # of the string so that a query of "0" is still searched for.
    my $needle = $query;
    $needle =~ s/^\s+|\s+$//g;

    if (length $needle) {
        if (-e $index_file) {
            my $searched = eval {
                # The index was written as encode_json() output through an
                # :encoding layer, so it is read back through the same layer
                # and then handed to decode_json().
                open my $in, "<:encoding($encoding)", $index_file
                    or die "无法读取索引文件: $!";
                my $json = do { local $/; <$in> };
                close $in;
                my $pages = decode_json($json);

                # The query is user input: quote it so it is matched
                # literally and can neither break nor abuse the regex.
                my $pattern = qr/\Q$needle\E/i;

                my @results = ();
                foreach my $page (@$pages) {
                    my $content = $page->{content};
                    next unless $page->{title} =~ $pattern
                             || $content       =~ $pattern;

                    # Collect up to three snippets of surrounding context.
                    # Indexed text is entity-decoded, so each part is
                    # escaped again before being placed into the page.
                    my @snippets = ();
                    while ($content =~ /(.{0,$context_length})($pattern)(.{0,$context_length})/g) {
                        my ($before, $match, $after) = ($1, $2, $3);
                        push @snippets,
                              encode_entities($before, q{<>&"'})
                            . '<span class="highlight">'
                            . encode_entities($match, q{<>&"'})
                            . '</span>'
                            . encode_entities($after, q{<>&"'});
                        last if @snippets >= 3;
                    }

                    # The hit was in the title only: show the summary.
                    if (@snippets == 0) {
                        push @snippets, encode_entities($page->{summary}, q{<>&"'});
                    }

                    push @results, {
                        url      => $page->{url},
                        title    => $page->{title},
                        snippets => \@snippets,
                    };
                }

                # Render the result list.
                my $total_results = scalar(@results);
                print "<h2>搜索结果: " . ($total_results == 0 ? "没有找到匹配的结果" : "找到 $total_results 个匹配结果") . "</h2>\n";

                if ($total_results > 0) {
                    # Show at most $max_results entries.
                    my $shown = $total_results > $max_results ? $max_results : $total_results;
                    for my $result (@results[0 .. $shown - 1]) {
                        my $href  = encode_entities($result->{url},   q{<>&"'});
                        my $label = encode_entities($result->{title}, q{<>&"'});

                        print qq{<div class="result">\n};
                        print qq{<div class="result-title"><a href="$href">$label</a></div>\n};
                        print qq{<div class="result-url">$href</div>\n};
                        print qq{<div class="result-snippet">\n};
                        # Snippets are already HTML-safe (escaped above).
                        foreach my $snippet (@{ $result->{snippets} }) {
                            print "...$snippet...<br>\n";
                        }
                        print "</div>\n";
                        print "</div>\n";
                    }

                    # Tell the user when the list was truncated.
                    if ($total_results > $max_results) {
                        print "<p>共 $total_results 个结果,仅显示前 $max_results 个。</p>\n";
                    }
                } else {
                    print '<p class="no-results">没有找到匹配的结果。请尝试使用不同的搜索词。</p>';
                }
                1;
            };

            if (!$searched) {
                my $error = encode_entities("$@", q{<>&"'});
                print "<div class='message-box error'>";
                print "<h3>搜索时发生错误</h3>";
                print "<p>$error</p>";
                print "</div>";
            }
        } else {
            print "<div class='message-box error'>";
            print "<h3>索引文件不存在</h3>";
            # Single-quoted so the embedded double quotes need no escaping.
            print '<p>系统尚未创建搜索索引。请点击"重建索引"按钮创建索引。</p>';
            print "</div>";
        }
    }
}
# 输出HTML页脚
print <<HTML;
</body>
</html>