From f75852302f59d3192a3a70625150e6802324c228 Mon Sep 17 00:00:00 2001 From: Ward Wouts Date: Sat, 4 Jan 2014 19:20:50 +0000 Subject: [PATCH] - update to work with newer ruby versions - switch from using xhdr to xover for additional speed in reading article headers - implement a MAXAGE option to prevent downloading articles older than the given number of days --- trunk/ripnews/news/article.rb | 148 +++++++++++++++++++++------------- trunk/ripnews/news/newsrc.rb | 12 ++- trunk/ripnews/ripnews.rb | 83 ++++++++++--------- 3 files changed, 139 insertions(+), 104 deletions(-) diff --git a/trunk/ripnews/news/article.rb b/trunk/ripnews/news/article.rb index 68e1056..05bbf33 100644 --- a/trunk/ripnews/news/article.rb +++ b/trunk/ripnews/news/article.rb @@ -17,9 +17,9 @@ # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # -require 'set/intspan' -require 'net/nntp' -require 'news/newsrc' +require Pathname.new(__FILE__).dirname + '../set/intspan' +require Pathname.new(__FILE__).dirname + '../net/nntp' +require Pathname.new(__FILE__).dirname + '../news/newsrc' require 'tempfile' require 'timeout' #require 'yaml' @@ -98,11 +98,11 @@ end # module Net class Article -Debuglevel = 1 +Debuglevel = 0 Message = Struct.new(:messid, :id, :date, :from, :server, :subject) -def initialize(nntpservers, groupname, newsrc="~/.newsrc") +def initialize(nntpservers, groupname, newsrc="~/.newsrc", maxage=0) @messageinfo = [] @grouped = false @@ -113,6 +113,7 @@ def initialize(nntpservers, groupname, newsrc="~/.newsrc") @cache_buf = {} @serverlist = [] @serverpasses = {} + @maxage = maxage.to_i tmplist = nntpservers.split('|') tmplist.each{ |server| @@ -262,7 +263,9 @@ def get_articles(cachedir=false) end if first.to_i <= last.to_i # available articles on server + # oudste @connections[server]["first"] = first ? first.to_i : 0 + # nieuwste @connections[server]["last"] = last ? 
last.to_i : 0 if Debuglevel > 0 puts " Server: #{server} First: #{first} Last: #{last}" @@ -283,15 +286,14 @@ def get_articles(cachedir=false) # wil wel wat ophalen aangezien logging aantoont dat er wel oudere articles gedownload worden @connections.keys.each{|server| if @connections[server]["skip_ids"].max && @connections[server]["skip_ids"].max < @connections[server]["last"] - articles = @connections[server]["last"] - @connections[server]["first"] - if articles > 10000 - fillerend = (@connections[server]["skip_ids"].max - (articles/5)).to_i + cnt_articles = @connections[server]["last"] - @connections[server]["first"] + if cnt_articles > 10000 + fillerend = (@connections[server]["skip_ids"].max - (cnt_articles/5)).to_i else fillerend = @connections[server]["skip_ids"].max - 2000 end if @connections[server]["skip_ids"].min && fillerend > @connections[server]["skip_ids"].min @connections[server]["skip_ids"] = @connections[server]["skip_ids"].union("#{@connections[server]["skip_ids"].min}-#{fillerend}") - # p "filling #{@connections[server]["skip_ids"].min}-#{fillerend}" end end } @@ -307,46 +309,45 @@ def get_articles(cachedir=false) puts "rangelist elements: #{range.diff(@connections[server]["skip_ids"]).elements}" if Debuglevel > 2 begin unless rangelist == nil or rangelist =~ /^$/ - rangelist.split(',').each{|i| + lastdate = DateTime.now + # nieuwste eerst. Dat maakt het mogelijk om op te houden bij te oude datum. 
+ rangelist.split(',').reverse.each{|i| puts "i: #{i}" if Debuglevel > 2 begin -# resp, xover_lines = get_xover(server, i) - resp, date_lines = get_xhdr(server, i, "date") - resp, subj_lines = get_xhdr(server, i, "subject") - resp, messid_lines = get_xhdr(server, i, "message-id") - resp, from_lines = get_xhdr(server, i, "from") - rescue TempError + resp, xover_lines = get_xover(server, i) + rescue TempError, EOFError printerr(server) next end art = {} - date_lines.collect{|x| - art[x[0]] = {} unless art.has_key?(x[0]) + xover_lines.collect{|x| + id = x[0] + subj = x[1] + auth = x[2] + date = x[3] + messid = x[4] + + art[id] = {} unless art.has_key?(id) + begin - art[x[0]]["date"] = DateTime.parse(x[1]).strftime('%Y%m%d') + lastdate = art[id]["date"] = DateTime.parse(date).strftime('%Y%m%d') rescue puts $!.message - puts x[1] - art[x[0]]["date"] = Time.now.strftime('%Y%m%d') + puts id + art[id]["date"] = Time.now.strftime('%Y%m%d') end - puts "art id: #{x[0]} date: #{x[1]}" if Debuglevel > 2 - } - subj_lines.collect{|x| - art[x[0]] = {} unless art.has_key?(x[0]) - art[x[0]]["subject"] = x[1] - puts "art id: #{x[0]} subj: #{x[1]}" if Debuglevel > 2 - } - messid_lines.collect{|x| - art[x[0]] = {} unless art.has_key?(x[0]) - art[x[0]]["messid"] = x[1] - puts "art id: #{x[0]} messid: #{x[1]}" if Debuglevel > 2 - } - from_lines.collect{|x| - art[x[0]] = {} unless art.has_key?(x[0]) - art[x[0]]["from"] = x[1] - puts "art id: #{x[0]} from: #{x[1]}" if Debuglevel > 2 - } + art[id]["subject"] = x[1] + art[id]["messid"] = messid + art[id]["from"] = auth + + puts "art id: #{id} subj: #{subj}" if Debuglevel > 2 + puts "art id: #{id} from: #{auth}" if Debuglevel > 2 + puts "art id: #{id} date: #{date}" if Debuglevel > 2 + puts "art id: #{id} messid: #{messid}" if Debuglevel > 2 + + } # xover_lines.collect + art.keys.each{|id| if art[id].has_key?("date") and art[id].has_key?("subject") and art[id].has_key?("messid") and art[id].has_key?("from") puts "adding: #{art[id]["messid"]}, 
#{id}, #{server}, #{art[id]["from"]}, #{art[id]["subject"]}" if Debuglevel > 2 @@ -359,6 +360,12 @@ def get_articles(cachedir=false) cache_add(cachedir, id, art[id]["messid"], art[id]["date"], art[id]["from"], art[id]["subject"], server) end } + if @maxage and @maxage > 0 + if DateTime.parse(lastdate) < ( DateTime.now - @maxage ) + puts "Skipping articles older than #{DateTime.now - @maxage}" + break + end + end } end rescue PermError @@ -407,6 +414,7 @@ end def get_xhdr(server, range, header) timedout = 0 + attempts = 0 resp = "" lines = [] begin @@ -432,11 +440,18 @@ def get_xhdr(server, range, header) else puts "Won't handle this... yet :(" end - rescue Errno::EPIPE, Errno::ECONNRESET, EOFError + #rescue Errno::EPIPE, Errno::ECONNRESET, EOFError + rescue Errno::EPIPE, Errno::ECONNRESET printerr(server) reconnect(server) get_group_info(server) - retry + attempts += 1 + if attempts < 2 + retry + else + printerr "giving up" + return + end end end return resp, lines @@ -486,6 +501,11 @@ def get_xover(server, range) reconnect(server) get_group_info(server) retry + rescue Net::NNTPDataError + printerr(server) + reconnect(server) + get_group_info(server) + retry end end return resp, lines @@ -518,7 +538,7 @@ def get_body(server, message) resp, id, messid, list = @connections[server]["nntp"].body(message) rescue Net::NNTPReplyError a = '' - a += $! 
+ a += $!.to_s
 printerr(server)
 if retries == 0 && (a =~ /^503/ || a =~ /^400/)
 reconnect(server)
@@ -826,7 +846,6 @@ def cache_read(cachedir)
 puts "#{Time.now} Reading & scrubbing caches"
 filename = "#{cachedir}/#{@group}.ripnewscache"
 excludes = {}
- regexp = Regexp.new('^([^\|]*)\|([^\|]*)\|([^\|]*)\|([^\|]*)\|(.*)')
 @connections.keys.each{|server|
 first = @connections[server]["first"]
 last = @connections[server]["last"]
@@ -836,25 +855,38 @@ puts " #{Time.now} Reading cache for #{server}"
 @connections[server]["skip_ids"].elements.collect!{|x| excludes[server][x]=true}
 if FileTest.directory?(cachedir) and FileTest.file?( "#{filename}.#{server}" ) and FileTest.readable?( "#{filename}.#{server}" )
 outfile = File.new("#{filename}.#{server}.new", "w") or puts "Couldn't open cachefile for writing"
- File.new( "#{filename}.#{server}" ).each{ |line|
- line =~ regexp
- id_i = $1.to_i
- messid = $2
- date = $3
- from = $4
- subject = $5
- if first <= id_i and id_i <= last
- if ! excludes[server].has_key?(id_i)
- outfile.puts(line)
- if preselect(subject)
- add(id_i, messid, date, from, subject, server)
+ cachefile = File.new( "#{filename}.#{server}" )
+ begin
+ while true do
+ # using each on a big cachefile leads to out of memory conditions
+ line = cachefile.readline
+ begin
+ line.encode!("US-ASCII")
+ rescue
+ # If it gets here, the original encoding is unknown
+ # gracefully give up and go to the next line
+ puts "String#encode couldn't handle: '#{line}'"
+ next
+ # so the following probably won't help
+ #line.gsub!(/\\/, "")
+ end
+
+ id_i, messid, date, from, subject = line.split('|', 5)
+ id_i = id_i.to_i
+ if first <= id_i and id_i <= last
+ if ! 
excludes[server].has_key?(id_i)
+ outfile.puts(line)
+ if preselect(subject)
+ add(id_i, messid, date, from, subject, server)
+ end
+ # XXX alle traagheid van de cache_read zit in deze regel:
+ @connections[server]["skip_ids"].insert!(id_i)
 end
- # XXX alle traagheid van de cache_read zit in deze regel:
- @connections[server]["skip_ids"].insert!(id_i)
 end
 end
- }
- if ( File.move("#{filename}.#{server}.new", "#{filename}.#{server}") )
+ rescue EOFError
+ end
+ if ( FileUtils.move("#{filename}.#{server}.new", "#{filename}.#{server}") )
 puts " #{Time.now} Cache scrubbed for #{server}"
 else
 puts "Couldn't scrub #{server} cache"
diff --git a/trunk/ripnews/news/newsrc.rb b/trunk/ripnews/news/newsrc.rb
index b24fd5f..270349d 100644
--- a/trunk/ripnews/news/newsrc.rb
+++ b/trunk/ripnews/news/newsrc.rb
@@ -17,7 +17,7 @@
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #
-require "set/intspan"
+require Pathname.new(__FILE__).dirname + "../set/intspan"
 #require "thread"
 module News
@@ -93,7 +93,7 @@ def save_as(file)
 # @@save_lock.synchronize{
 if FileTest.exists?("#{file}")
 begin
- File.rename(file, "#{file}.bak")
+ FileUtils.mv(file, "#{file}.bak")
 rescue
 puts "Can't rename #{file}, #{file}.bak: #{$!}"
 exit
@@ -142,7 +142,9 @@ def save_group_as(file, group)
 #p Time.now
 #p "copy file"
 if FileTest.exists?("#{file}")
- if ( ! File.copy(file, "#{file}.bak") )
+ begin
+ FileUtils.copy(file, "#{file}.bak")
+ rescue
 puts "Can't copy #{file} to #{file}.bak: #{$!}"
 end
 end
@@ -176,7 +178,9 @@ def save_group_as(file, group)
 puts "Newsrc.parse: Bad newsrc line: #{line}"
 # restore backup on failure, it'll contain the flaw too, but it'll
 # be complete
- if ( ! 
File.copy("#{file}.bak", file) ) + begin + FileUtils.copy("#{file}.bak", file) + rescue puts "Can't copy #{file}.bak to #{file}: #{$!}" end exit diff --git a/trunk/ripnews/ripnews.rb b/trunk/ripnews/ripnews.rb index ba79ba3..0c9fcbb 100755 --- a/trunk/ripnews/ripnews.rb +++ b/trunk/ripnews/ripnews.rb @@ -23,13 +23,14 @@ require 'date' require 'ftools' require 'time' require 'getoptlong' -require 'news/article' -require 'news/newsrc' +require 'pathname' +require Pathname.new(__FILE__).dirname + 'news/article' +require Pathname.new(__FILE__).dirname + 'news/newsrc' require 'tempfile' require 'thread' require 'thwait' -require 'encode/uuencode' -require 'encode/yenc' +require Pathname.new(__FILE__).dirname + 'encode/uuencode' +require Pathname.new(__FILE__).dirname + 'encode/yenc' ########################################################################### @@ -100,9 +101,10 @@ def save_file(dir, name, data) case data.class.to_s when "String" puts " Moving #{data} to #{dir}/#{newname}" - if File.move(data, "#{dir}/#{newname}") + begin + FileUtils.mv(data, "#{dir}/#{newname}") puts " Saved as: '#{newname}'" - else + rescue puts "couldn't rename tempfile" return false end @@ -117,9 +119,10 @@ def save_file(dir, name, data) end when "Tempfile" puts " Moving #{data} to #{dir}/#{newname}" - if File.move(data.path, "#{dir}/#{newname}") + begin + FileUtils.mv(data.path, "#{dir}/#{newname}") puts " Saved as: '#{newname}'" - else + rescue puts "couldn't rename tempfile #{data.path}" return false end @@ -145,39 +148,35 @@ def filename_sanitize(nname) # XXX this is a mac problem, should be configurable!!!! 
group = @config.keys[0] if @config[group].has_key?("ASCIIFILENAMES") && @config[group]["ASCIIFILENAMES"] - require 'iconv' begin # hmmm, misschien is de begin waarde wel gewoon geen UTF-8, maar 8-bit ASCII # wat de mac filesystems nog slechter maakt overigens - nname = Iconv.conv("ASCII//TRANSLIT", "UTF-8", nname) - rescue Iconv::IllegalSequence - begin - nname = Iconv.conv("ASCII//TRANSLIT", "ISO-8859-1", nname) - rescue Iconv::IllegalSequence - # If it gets here, the original encoding is not UTF-8 of ISO-8859-1 - # so the following probably won't help - puts "Iconv couldn't handle: '#{nname}'" - nname.gsub!(/\\\d+/, "#") - nname.gsub!(/ä/, "\"a") - nname.gsub!(/ë/, "\"e") - nname.gsub!(/ï/, "\"i") - nname.gsub!(/ö/, "\"o") - nname.gsub!(/ü/, "\"u") - nname.gsub!(/ñ/, "~n") - nname.gsub!(/#{86.chr}/, "U") - nname.gsub!(/#{151.chr}/, " ") - nname.gsub!(/#{161.chr}/, "i") - nname.gsub!(/#{164.chr}/, "") - nname.gsub!(/#{171.chr}/, " ") - nname.gsub!(/#{180.chr}/, " ") - nname.gsub!(/#{183.chr}/, " ") - nname.gsub!(/#{187.chr}/, " ") - nname.gsub!(/#{227.chr}/, "~a") - nname.gsub!(/#{229.chr}/, "a") - nname.gsub!(/#{231.chr}/, "c") - nname.gsub!(/#{243.chr}/, "o") - nname.gsub!(/#{247.chr}/, "S") - end + #nname = Iconv.conv("ASCII//TRANSLIT", "UTF-8", nname) + nname.encode!("US-ASCII") + rescue + # If it gets here, the original encoding is not UTF-8 of ISO-8859-1 + # so the following probably won't help + puts "String#encode couldn't handle: '#{nname}'" + nname.gsub!(/\\\d+/, "#") + #nname.gsub!(/ä/, "\"a") + #nname.gsub!(/ë/, "\"e") + #nname.gsub!(/ï/, "\"i") + #nname.gsub!(/ö/, "\"o") + #nname.gsub!(/ü/, "\"u") + #nname.gsub!(/ñ/, "~n") + nname.gsub!(/#{86.chr}/, "U") + nname.gsub!(/#{151.chr}/, " ") + nname.gsub!(/#{161.chr}/, "i") + nname.gsub!(/#{164.chr}/, "") + nname.gsub!(/#{171.chr}/, " ") + nname.gsub!(/#{180.chr}/, " ") + nname.gsub!(/#{183.chr}/, " ") + nname.gsub!(/#{187.chr}/, " ") + nname.gsub!(/#{227.chr}/, "~a") + nname.gsub!(/#{229.chr}/, "a") + 
nname.gsub!(/#{231.chr}/, "c") + nname.gsub!(/#{243.chr}/, "o") + nname.gsub!(/#{247.chr}/, "S") end # shit that gets through... nname.gsub!('#{146.chr}', "") @@ -429,7 +428,7 @@ def lock exit rescue Errno::ESRCH puts "Stale lock found... removing..." - File.unlink(@config[group]["LOCKFILE"]) + FileUtils.rm(@config[group]["LOCKFILE"]) end else puts "Empty lockfile found... removing..." @@ -568,7 +567,7 @@ def get_multi(subj, group) while FileTest.exists?("#{tbodybase}-#{i}") i += 1 end - File.move(tbody, "#{tbodybase}-#{i}") + FileUtils.mv(tbody, "#{tbodybase}-#{i}") tbody = "#{tbodybase}-#{i}" tfile.close tfileout.close(false) @@ -666,7 +665,7 @@ def get_max_file_length(tempdir=".") name = "#$$#{name}" begin file = File.new("#{tempdir}/#{name}", "w", 0644).close - File.delete("#{tempdir}/#{name}") + FileUtils.rm("#{tempdir}/#{name}") rescue Errno::ENAMETOOLONG name = name[0...-1] retry @@ -753,7 +752,7 @@ def main @newsrc_lock = Mutex.new profile_mem("#{group} start") puts "\nGetting articles for #{group}" - @articles = Article.new(@config[group]["NNTPSERVER"], group, @config[group]["NEWSRCNAME"]) + @articles = Article.new(@config[group]["NNTPSERVER"], group, @config[group]["NEWSRCNAME"], @config[group]["MAXAGE"]) fill_preselector(group) puts "initialized" @articles.get_articles(@config[group]["CACHEDIR"])