- update to work with newer ruby versions

- switch from using xhdr to xover for additional speed in reading
  article headers
- implement a MAXAGE option to prevent downloading articles older than
  the given number of days
This commit is contained in:
Ward Wouts 2014-01-04 19:20:50 +00:00
parent bb314ebce1
commit f75852302f
3 changed files with 139 additions and 104 deletions

View file

@ -17,9 +17,9 @@
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
require 'set/intspan'
require 'net/nntp'
require 'news/newsrc'
require Pathname.new(__FILE__).dirname + '../set/intspan'
require Pathname.new(__FILE__).dirname + '../net/nntp'
require Pathname.new(__FILE__).dirname + '../news/newsrc'
require 'tempfile'
require 'timeout'
#require 'yaml'
@ -98,11 +98,11 @@ end # module Net
class Article
Debuglevel = 1
Debuglevel = 0
Message = Struct.new(:messid, :id, :date, :from, :server, :subject)
def initialize(nntpservers, groupname, newsrc="~/.newsrc")
def initialize(nntpservers, groupname, newsrc="~/.newsrc", maxage=0)
@messageinfo = []
@grouped = false
@ -113,6 +113,7 @@ def initialize(nntpservers, groupname, newsrc="~/.newsrc")
@cache_buf = {}
@serverlist = []
@serverpasses = {}
@maxage = maxage.to_i
tmplist = nntpservers.split('|')
tmplist.each{ |server|
@ -262,7 +263,9 @@ def get_articles(cachedir=false)
end
if first.to_i <= last.to_i
# available articles on server
# oudste
@connections[server]["first"] = first ? first.to_i : 0
# nieuwste
@connections[server]["last"] = last ? last.to_i : 0
if Debuglevel > 0
puts " Server: #{server} First: #{first} Last: #{last}"
@ -283,15 +286,14 @@ def get_articles(cachedir=false)
# wil wel wat ophalen aangezien logging aantoont dat er wel oudere articles gedownload worden
@connections.keys.each{|server|
if @connections[server]["skip_ids"].max && @connections[server]["skip_ids"].max < @connections[server]["last"]
articles = @connections[server]["last"] - @connections[server]["first"]
if articles > 10000
fillerend = (@connections[server]["skip_ids"].max - (articles/5)).to_i
cnt_articles = @connections[server]["last"] - @connections[server]["first"]
if cnt_articles > 10000
fillerend = (@connections[server]["skip_ids"].max - (cnt_articles/5)).to_i
else
fillerend = @connections[server]["skip_ids"].max - 2000
end
if @connections[server]["skip_ids"].min && fillerend > @connections[server]["skip_ids"].min
@connections[server]["skip_ids"] = @connections[server]["skip_ids"].union("#{@connections[server]["skip_ids"].min}-#{fillerend}")
# p "filling #{@connections[server]["skip_ids"].min}-#{fillerend}"
end
end
}
@ -307,46 +309,45 @@ def get_articles(cachedir=false)
puts "rangelist elements: #{range.diff(@connections[server]["skip_ids"]).elements}" if Debuglevel > 2
begin
unless rangelist == nil or rangelist =~ /^$/
rangelist.split(',').each{|i|
lastdate = DateTime.now
# nieuwste eerst. Dat maakt het mogelijk om op te houden bij te oude datum.
rangelist.split(',').reverse.each{|i|
puts "i: #{i}" if Debuglevel > 2
begin
# resp, xover_lines = get_xover(server, i)
resp, date_lines = get_xhdr(server, i, "date")
resp, subj_lines = get_xhdr(server, i, "subject")
resp, messid_lines = get_xhdr(server, i, "message-id")
resp, from_lines = get_xhdr(server, i, "from")
rescue TempError
resp, xover_lines = get_xover(server, i)
rescue TempError, EOFError
printerr(server)
next
end
art = {}
date_lines.collect{|x|
art[x[0]] = {} unless art.has_key?(x[0])
xover_lines.collect{|x|
id = x[0]
subj = x[1]
auth = x[2]
date = x[3]
messid = x[4]
art[id] = {} unless art.has_key?(id)
begin
art[x[0]]["date"] = DateTime.parse(x[1]).strftime('%Y%m%d')
lastdate = art[id]["date"] = DateTime.parse(date).strftime('%Y%m%d')
rescue
puts $!.message
puts x[1]
art[x[0]]["date"] = Time.now.strftime('%Y%m%d')
puts id
art[id]["date"] = Time.now.strftime('%Y%m%d')
end
puts "art id: #{x[0]} date: #{x[1]}" if Debuglevel > 2
}
subj_lines.collect{|x|
art[x[0]] = {} unless art.has_key?(x[0])
art[x[0]]["subject"] = x[1]
puts "art id: #{x[0]} subj: #{x[1]}" if Debuglevel > 2
}
messid_lines.collect{|x|
art[x[0]] = {} unless art.has_key?(x[0])
art[x[0]]["messid"] = x[1]
puts "art id: #{x[0]} messid: #{x[1]}" if Debuglevel > 2
}
from_lines.collect{|x|
art[x[0]] = {} unless art.has_key?(x[0])
art[x[0]]["from"] = x[1]
puts "art id: #{x[0]} from: #{x[1]}" if Debuglevel > 2
}
art[id]["subject"] = x[1]
art[id]["messid"] = messid
art[id]["from"] = auth
puts "art id: #{id} subj: #{subj}" if Debuglevel > 2
puts "art id: #{id} from: #{auth}" if Debuglevel > 2
puts "art id: #{id} date: #{date}" if Debuglevel > 2
puts "art id: #{id} messid: #{messid}" if Debuglevel > 2
} # xover_lines.collect
art.keys.each{|id|
if art[id].has_key?("date") and art[id].has_key?("subject") and art[id].has_key?("messid") and art[id].has_key?("from")
puts "adding: #{art[id]["messid"]}, #{id}, #{server}, #{art[id]["from"]}, #{art[id]["subject"]}" if Debuglevel > 2
@ -359,6 +360,12 @@ def get_articles(cachedir=false)
cache_add(cachedir, id, art[id]["messid"], art[id]["date"], art[id]["from"], art[id]["subject"], server)
end
}
if @maxage and @maxage > 0
if DateTime.parse(lastdate) < ( DateTime.now - @maxage )
puts "Skipping articles older than #{DateTime.now - @maxage}"
break
end
end
}
end
rescue PermError
@ -407,6 +414,7 @@ end
def get_xhdr(server, range, header)
timedout = 0
attempts = 0
resp = ""
lines = []
begin
@ -432,11 +440,18 @@ def get_xhdr(server, range, header)
else
puts "Won't handle this... yet :("
end
rescue Errno::EPIPE, Errno::ECONNRESET, EOFError
#rescue Errno::EPIPE, Errno::ECONNRESET, EOFError
rescue Errno::EPIPE, Errno::ECONNRESET
printerr(server)
reconnect(server)
get_group_info(server)
attempts += 1
if attempts < 2
retry
else
printerr "giving up"
return
end
end
end
return resp, lines
@ -486,6 +501,11 @@ def get_xover(server, range)
reconnect(server)
get_group_info(server)
retry
rescue Net::NNTPDataError
printerr(server)
reconnect(server)
get_group_info(server)
retry
end
end
return resp, lines
@ -518,7 +538,7 @@ def get_body(server, message)
resp, id, messid, list = @connections[server]["nntp"].body(message)
rescue Net::NNTPReplyError
a = ''
a += $!
a += $!.to_s
printerr(server)
if retries == 0 && (a =~ /^503/ || a =~ /^400/)
reconnect(server)
@ -826,7 +846,6 @@ def cache_read(cachedir)
puts "#{Time.now} Reading & scrubbing caches"
filename = "#{cachedir}/#{@group}.ripnewscache"
excludes = {}
regexp = Regexp.new('^([^\|]*)\|([^\|]*)\|([^\|]*)\|([^\|]*)\|(.*)')
@connections.keys.each{|server|
first = @connections[server]["first"]
last = @connections[server]["last"]
@ -836,13 +855,24 @@ puts " #{Time.now} Reading cache for #{server}"
@connections[server]["skip_ids"].elements.collect!{|x| excludes[server][x]=true}
if FileTest.directory?(cachedir) and FileTest.file?( "#{filename}.#{server}" ) and FileTest.readable?( "#{filename}.#{server}" )
outfile = File.new("#{filename}.#{server}.new", "w") or puts "Couldn't open cachefile for writing"
File.new( "#{filename}.#{server}" ).each{ |line|
line =~ regexp
id_i = $1.to_i
messid = $2
date = $3
from = $4
subject = $5
cachefile = File.new( "#{filename}.#{server}" )
begin
while true do
# using each on a big cachefile leads to out of memory conditions
line = cachefile.readline
begin
line.encode!("US-ASCII")
rescue
# If it gets here, the original encoding is unknown
# gracefully give up and go to the next line
puts "String#encode couldn't handle: '#{line}'"
next
# so the following probably won't help
#line.gsub!(/\\/, "")
end
id_i, messid, date, from, subject = line.split('|', 5)
id_i = id_i.to_i
if first <= id_i and id_i <= last
if ! excludes[server].has_key?(id_i)
outfile.puts(line)
@ -853,8 +883,10 @@ puts " #{Time.now} Reading cache for #{server}"
@connections[server]["skip_ids"].insert!(id_i)
end
end
}
if ( File.move("#{filename}.#{server}.new", "#{filename}.#{server}") )
end
rescue EOFError
end
if ( FileUtils.move("#{filename}.#{server}.new", "#{filename}.#{server}") )
puts " #{Time.now} Cache scrubbed for #{server}"
else
puts "Couldn't scrub #{server} cache"

View file

@ -17,7 +17,7 @@
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
require "set/intspan"
require Pathname.new(__FILE__).dirname + "../set/intspan"
#require "thread"
module News
@ -93,7 +93,7 @@ def save_as(file)
# @@save_lock.synchronize{
if FileTest.exists?("#{file}")
begin
File.rename(file, "#{file}.bak")
FileUtils.mv(file, "#{file}.bak")
rescue
puts "Can't rename #{file}, #{file}.bak: #{$!}"
exit
@ -142,7 +142,9 @@ def save_group_as(file, group)
#p Time.now
#p "copy file"
if FileTest.exists?("#{file}")
if ( ! File.copy(file, "#{file}.bak") )
begin
FileUtils.copy(file, "#{file}.bak")
rescue
puts "Can't copy #{file} to #{file}.bak: #{$!}"
end
end
@ -176,7 +178,9 @@ def save_group_as(file, group)
puts "Newsrc.parse: Bad newsrc line: #{line}"
# restore backup on failure, it'll contain the flaw too, but it'll
# be complete
if ( ! File.copy("#{file}.bak", file) )
begin
FileUtils.copy("#{file}.bak", file)
rescue
puts "Can't copy #{file}.bak to #{file}: #{$!}"
end
exit

View file

@ -23,13 +23,14 @@ require 'date'
require 'ftools'
require 'time'
require 'getoptlong'
require 'news/article'
require 'news/newsrc'
require 'pathname'
require Pathname.new(__FILE__).dirname + 'news/article'
require Pathname.new(__FILE__).dirname + 'news/newsrc'
require 'tempfile'
require 'thread'
require 'thwait'
require 'encode/uuencode'
require 'encode/yenc'
require Pathname.new(__FILE__).dirname + 'encode/uuencode'
require Pathname.new(__FILE__).dirname + 'encode/yenc'
###########################################################################
@ -100,9 +101,10 @@ def save_file(dir, name, data)
case data.class.to_s
when "String"
puts " Moving #{data} to #{dir}/#{newname}"
if File.move(data, "#{dir}/#{newname}")
begin
FileUtils.mv(data, "#{dir}/#{newname}")
puts " Saved as: '#{newname}'"
else
rescue
puts "couldn't rename tempfile"
return false
end
@ -117,9 +119,10 @@ def save_file(dir, name, data)
end
when "Tempfile"
puts " Moving #{data} to #{dir}/#{newname}"
if File.move(data.path, "#{dir}/#{newname}")
begin
FileUtils.mv(data.path, "#{dir}/#{newname}")
puts " Saved as: '#{newname}'"
else
rescue
puts "couldn't rename tempfile #{data.path}"
return false
end
@ -145,25 +148,22 @@ def filename_sanitize(nname)
# XXX this is a mac problem, should be configurable!!!!
group = @config.keys[0]
if @config[group].has_key?("ASCIIFILENAMES") && @config[group]["ASCIIFILENAMES"]
require 'iconv'
begin
# hmmm, misschien is de begin waarde wel gewoon geen UTF-8, maar 8-bit ASCII
# wat de mac filesystems nog slechter maakt overigens
nname = Iconv.conv("ASCII//TRANSLIT", "UTF-8", nname)
rescue Iconv::IllegalSequence
begin
nname = Iconv.conv("ASCII//TRANSLIT", "ISO-8859-1", nname)
rescue Iconv::IllegalSequence
#nname = Iconv.conv("ASCII//TRANSLIT", "UTF-8", nname)
nname.encode!("US-ASCII")
rescue
# If it gets here, the original encoding is not UTF-8 of ISO-8859-1
# so the following probably won't help
puts "Iconv couldn't handle: '#{nname}'"
puts "String#encode couldn't handle: '#{nname}'"
nname.gsub!(/\\\d+/, "#")
nname.gsub!(/ä/, "\"a")
nname.gsub!(/ë/, "\"e")
nname.gsub!(/ï/, "\"i")
nname.gsub!(/ö/, "\"o")
nname.gsub!(/ü/, "\"u")
nname.gsub!(/ñ/, "~n")
#nname.gsub!(/ä/, "\"a")
#nname.gsub!(/ë/, "\"e")
#nname.gsub!(/ï/, "\"i")
#nname.gsub!(/ö/, "\"o")
#nname.gsub!(/ü/, "\"u")
#nname.gsub!(/ñ/, "~n")
nname.gsub!(/#{86.chr}/, "U")
nname.gsub!(/#{151.chr}/, " ")
nname.gsub!(/#{161.chr}/, "i")
@ -178,7 +178,6 @@ def filename_sanitize(nname)
nname.gsub!(/#{243.chr}/, "o")
nname.gsub!(/#{247.chr}/, "S")
end
end
# shit that gets through...
nname.gsub!('#{146.chr}', "")
end
@ -429,7 +428,7 @@ def lock
exit
rescue Errno::ESRCH
puts "Stale lock found... removing..."
File.unlink(@config[group]["LOCKFILE"])
FileUtils.rm(@config[group]["LOCKFILE"])
end
else
puts "Empty lockfile found... removing..."
@ -568,7 +567,7 @@ def get_multi(subj, group)
while FileTest.exists?("#{tbodybase}-#{i}")
i += 1
end
File.move(tbody, "#{tbodybase}-#{i}")
FileUtils.mv(tbody, "#{tbodybase}-#{i}")
tbody = "#{tbodybase}-#{i}"
tfile.close
tfileout.close(false)
@ -666,7 +665,7 @@ def get_max_file_length(tempdir=".")
name = "#$$#{name}"
begin
file = File.new("#{tempdir}/#{name}", "w", 0644).close
File.delete("#{tempdir}/#{name}")
FileUtils.rm("#{tempdir}/#{name}")
rescue Errno::ENAMETOOLONG
name = name[0...-1]
retry
@ -753,7 +752,7 @@ def main
@newsrc_lock = Mutex.new
profile_mem("#{group} start")
puts "\nGetting articles for #{group}"
@articles = Article.new(@config[group]["NNTPSERVER"], group, @config[group]["NEWSRCNAME"])
@articles = Article.new(@config[group]["NNTPSERVER"], group, @config[group]["NEWSRCNAME"], @config[group]["MAXAGE"])
fill_preselector(group)
puts "initialized"
@articles.get_articles(@config[group]["CACHEDIR"])