From f54d8cb0b446cc6bd5c119a250dd3628545f0f94 Mon Sep 17 00:00:00 2001 From: Ward Wouts Date: Thu, 14 Oct 2004 11:46:31 +0000 Subject: [PATCH] major internal datastructure overhaul. major memory improvements. Will need to make a cache cleaning mechanism now --- trunk/ripnews/news/article.rb | 311 +++++++++++++++++----------------- 1 file changed, 159 insertions(+), 152 deletions(-) diff --git a/trunk/ripnews/news/article.rb b/trunk/ripnews/news/article.rb index bf7a925..2b916cf 100644 --- a/trunk/ripnews/news/article.rb +++ b/trunk/ripnews/news/article.rb @@ -1,4 +1,4 @@ -# $Dwarf: article.rb,v 1.88 2004/06/16 08:16:23 ward Exp $ +# $Dwarf: article.rb,v 1.89 2004/09/01 11:25:46 ward Exp $ # $Source$ # @@ -22,6 +22,7 @@ require 'net/nntp' require 'news/newsrc' require 'tempfile' require 'timeout' +#require 'yaml' class ArticleError < RuntimeError; end class TempError < ArticleError; end @@ -29,26 +30,37 @@ class PermError < ArticleError; end class Article -Debuglevel = 0 +Debuglevel = 1 + +Message = Struct.new(:messid, :id, :server, :subject) def initialize(nntpservers, groupname, newsrc="~/.newsrc") - @messids = [] - @ids = [] - @servers = [] - @subjects = [] + @messageinfo = [] - @sorted = false @grouped = false @groups = {} @gotten = {} @group = groupname + @preselectpatterns = [] + @newids = {} @serverlist = nntpservers.split('|') @connections = {} @serverlist.collect{|server| @connections[server] = {} + @newids[server] = {} begin - @connections[server]["nntp"] = Net::NNTP.new(server) + p server + p Time.now + begin + timeout(180) do + @connections[server]["nntp"] = Net::NNTP.new(server) + end + rescue TimeoutError + puts "Timeout due to sucky server, reconnecting" + retry + end + p Time.now @connections[server]["skip_ids"] = Set::IntSpan.new() @connections[server]["newsrc"] = News::Newsrc.new("#{newsrc}.#{server}") set_skip_ids(server, @connections[server]["newsrc"].marked_articles(@group)) @@ -80,16 +92,41 @@ def reconnect(server) print "Succesfully reconnected to #{server}\n" end +def memusage + print "memprof:\n" + print "global:\n" +# for i in global_variables +# print "#{i}\n" +# end +# print "local:\n" +# for i in local_variables +# print "#{i}\n" +# end + for i in self.instance_variables + puts i + print "X: " + begin + puts self.instance_eval(i).size + rescue NoMethodError + end + end +end + +def add_preselect_pattern(regexp) + @preselectpatterns.push(regexp) +end + +def preselect(subject) + @preselectpatterns.collect{|regexp| + if regexp.match(subject) != nil + return true + end + } + return false +end + def add(messid, id, server, subject) -# print "Messid: #{messid}\n" -# print "Id: #{id}\n" -# print "Server: #{server}\n" -# print "Subject: #{subject}\n" - @messids.push(messid) - @ids.push(id.to_i) - @servers.push(server) - @subjects.push(subject) - @sorted = false + @messageinfo.push(Message.new(messid, id.to_i, server, subject)) @grouped = false end @@ -169,6 +206,7 @@ def get_articles(cachedir=false) for id in art.keys if art[id].has_key?("subject") and art[id].has_key?("messid") print "adding: #{art[id]["messid"]}, #{id}, #{server}, #{art[id]["subject"]}\n" if Debuglevel > 2 + @newids[server][id.to_i] = true add(art[id]["messid"], id, server, art[id]["subject"]) end end @@ -268,31 +306,6 @@ def get_xhdr(server, range, header) end end -# if xhdr doesn't work, this should be used -# for i in (range.diff(@connections[server]["skip_ids"]).elements) -# begin -# @connections[server]["nntp"].stat(i) -# resp, id, messid, list = @connections[server]["nntp"].head(i) -# for j in list -# if j =~ /Subject: (.*)/ -# subj=$1 -# end -# end -# print "get_articles messid: #{messid}\n" if Debuglevel > 1 -# print "get_articles id: #{id}\n" if Debuglevel > 1 -# print "get_articles server: #{server}\n" if Debuglevel > 1 -# print "get_articles subject: #{subj}\n" if Debuglevel > 1 -# add(messid, id, server, subj) -# rescue Net::NNTP::RuntimeError -# print "whoopsie couldn't stat #{i}\n" if Debuglevel > 1 -# end -# end - -def get_groups - group_subjects unless @grouped - return @groups -end - def get_groupname return @group end @@ -344,29 +357,31 @@ end def get_group_body(subj) result = [] group_subject_sort(subj) - return false if @groups[subj]["messages"] == nil - for i in (0...@groups[subj]["messages"].length) - unless @gotten.has_key?(@groups[subj]["messages"][i]) +# puts @groups[subj].to_yaml + return false if @groups[subj]["messageinfo"] == nil + for i in (0...@groups[subj]["messageinfo"].length) + unless @gotten.has_key?(@groups[subj]["messageinfo"][i][:messid]) print "getting article: #{i}\n" if Debuglevel > 1 print "getting article: #{subj}\n" if Debuglevel > 1 - print "full subject: #{@groups[subj]["subject"][i]}\n" if Debuglevel > 0 - print "message id: #{@groups[subj]["messages"][i]}\n" if Debuglevel > 1 - print "id: #{@groups[subj]["ids"][i]}\n" if Debuglevel > 1 - print "server: #{@groups[subj]["servers"][i]}\n" if Debuglevel > 0 + print "full subject: #{@groups[subj]["messageinfo"][i][:subject]}\n" if Debuglevel > 0 + print "message id: #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 1 + print "id: #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 1 + print "server: #{@groups[subj]["messageinfo"][i][:server]}\n" if Debuglevel > 0 resp = false while resp == false - if @serverlist.include?(@groups[subj]["servers"][i]) - resp, id, messid, list = get_body(@groups[subj]["servers"][i], @groups[subj]["messages"][i]) + if @serverlist.include?(@groups[subj]["messageinfo"][i][:server]) + resp, id, messid, list = get_body(@groups[subj]["messageinfo"][i][:server], @groups[subj]["messageinfo"][i][:messid]) else resp = false end if resp == false if Debuglevel > 1 print "mess-id i: #{@groups[subj]["messages"][i]}\n" - print "mess-id i+1: #{@groups[subj]["messages"][i+1]}\n" + # XXX dit moet netter kunnen + print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" if @groups[subj]["messageinfo"][i+1] != nil end - if (i+1 < @groups[subj]["messages"].length) and - (@groups[subj]["messages"][i] == @groups[subj]["messages"][i+1]) + if (i+1 < @groups[subj]["messageinfo"].length) and + (@groups[subj]["messageinfo"][i][:messid] == @groups[subj]["messageinfo"][i+1][:messid]) print " Trying next server...\n" i += 1 else @@ -374,7 +389,7 @@ def get_group_body(subj) end end end - @gotten[ @groups[subj]["messages"][i] ] = true + @gotten[ @groups[subj]["messageinfo"][i][:messid] ] = true result = list end end @@ -384,23 +399,24 @@ end def get_group_body_first(subj) group_subject_sort(subj) i = 0 - unless @groups[subj]["messages"] != nil && @groups[subj]["messages"][0] + unless @groups[subj]["messageinfo"] != nil && @groups[subj]["messageinfo"][0][:messid] return false end - while @gotten.has_key?(@groups[subj]["messages"][0]) == false + while @gotten.has_key?(@groups[subj]["messageinfo"][0][:messid]) == false print "getting article: #{subj}\n" if Debuglevel > 0 - print "full subject: #{@groups[subj]["subject"][0]}\n" if Debuglevel > 0 - print "message id: #{@groups[subj]["messages"][i]}\n" if Debuglevel > 1 - print "id: #{@groups[subj]["ids"][i]}\n" if Debuglevel > 1 - print "server: #{@groups[subj]["servers"][0]}\n" if Debuglevel > 0 + print "full subject: #{@groups[subj]["messageinfo"][0][:subject]}\n" if Debuglevel > 0 + print "message id: #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 1 + print "id: #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 1 + print "server: #{@groups[subj]["messageinfo"][0][:server]}\n" if Debuglevel > 0 resp = false while resp == false - resp, id, messid, list = get_body(@groups[subj]["servers"][i], @groups[subj]["messages"][i]) + resp, id, messid, list = get_body(@groups[subj]["messageinfo"][i][:server], @groups[subj]["messageinfo"][i][:messid]) if resp == false - print "mess-id i: #{@groups[subj]["messages"][i]}\n" - print "mess-id i+1: #{@groups[subj]["messages"][i+1]}\n" - if (i+1 < @groups[subj]["messages"].length) and - (@groups[subj]["messages"][i] == @groups[subj]["messages"][i+1]) + print "mess-id i: #{@groups[subj]["messageinfo"][i][:messid]}\n" + # XXX dit moet netter kunnen + print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" if @groups[subj]["messageinfo"][i+1] != nil + if (i+1 < @groups[subj]["messageinfo"].length) and + (@groups[subj]["messageinfo"][i][:messid] == @groups[subj]["messageinfo"][i+1][:messid]) print "Trying next server...\n" i += 1 else @@ -408,29 +424,31 @@ def get_group_body_first(subj) end end end - @gotten[@groups[subj]["messages"][i]] = true + @gotten[@groups[subj]["messageinfo"][i][:messid]] = true end return list end def get_group_body_rest(subj, file=nil) result = [] - for i in (1...@groups[subj]["messages"].length) - unless @gotten.has_key?(@groups[subj]["messages"][i]) + for i in (1...@groups[subj]["messageinfo"].length) + unless @gotten.has_key?(@groups[subj]["messageinfo"][i][:messid]) print "getting article: #{i}\n" if Debuglevel > 1 print "getting article: #{subj}\n" if Debuglevel > 1 - print "full subject: #{@groups[subj]["subject"][i]}\n" if Debuglevel > 0 - print "message id: #{@groups[subj]["messages"][i]}\n" if Debuglevel > 1 - print "id: #{@groups[subj]["ids"][i]}\n" if Debuglevel > 1 - print "server: #{@groups[subj]["servers"][i]}\n" if Debuglevel > 0 + print "full subject: #{@groups[subj]["messageinfo"][i][:subject]}\n" if Debuglevel > 0 + print "message id: #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 1 + print "id: #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 1 + print "server: #{@groups[subj]["messageinfo"][i][:server]}\n" if Debuglevel > 0 resp = false while resp == false - resp, id, messid, list = get_body(@groups[subj]["servers"][i], @groups[subj]["messages"][i]) + resp, id, messid, list = get_body(@groups[subj]["messageinfo"][i][:server], @groups[subj]["messages"][i]) if resp == false - print "mess-id i: #{@groups[subj]["messages"][i]}\n" - print "mess-id i+1: #{@groups[subj]["messages"][i+1]}\n" - if (i+1 < @groups[subj]["messages"].length) and - (@groups[subj]["messages"][i] == @groups[subj]["messages"][i+1]) + print "mess-id i: #{@groups[subj]["messageinfo"][i][:messid]}\n" + # print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" + # XXX dit moet netter kunnen + print "mess-id i+1: #{@groups[subj]["messageinfo"][i+1][:messid]}\n" if @groups[subj]["messageinfo"][i+1] != nil + if (i+1 < @groups[subj]["messageinfo"].length) and + (@groups[subj]["messageinfo"][i][:messid] == @groups[subj]["messageinfo"][i+1][:messid]) print "Trying next server...\n" i += 1 else @@ -438,7 +456,7 @@ def get_group_body_rest(subj, file=nil) end end end - @gotten[ @groups[subj]["messages"][i] ] = true + @gotten[ @groups[subj]["messageinfo"][i][:messid] ] = true if file list.collect{|line| file.print "#{line}\n"} else @@ -454,16 +472,15 @@ def get_group_subjects return @groups.keys end -def get_group_messids(subject) - group_subjects unless @grouped - return @groups[subject]["messages"] -end - def group_is_complete(subj) group_subjects unless @grouped #print "Subject: #{subj}\n" - print "length: #{@groups[subj]["messages"].length} total: #{@groups[subj]["total"].to_i}\n" if Debuglevel > 1 - umessids = @groups[subj]["messages"].uniq + print "length: #{@groups[subj]["messageinfo"].length} total: #{@groups[subj]["total"].to_i}\n" if Debuglevel > 1 + messids = [] + @groups[subj]["messageinfo"].each {|x| + messids.push(x[:messid]) + } + umessids = messids.uniq if (umessids.length ) >= @groups[subj]["total"].to_i return true else @@ -479,39 +496,25 @@ def group_is_multipart(subj) @groups[subj]["total"].to_i > 1 end -def get_messids - return @messids -end - -def get_subjects - return @subjects -end - def group_subjects @groups = {} - for i in (0...@subjects.length) - print "group subjects: #{i} #{@subjects[i]}\n" if Debuglevel > 3 - if @subjects[i] =~ /(.*)\((\d+)\/(\d+)\)(.*)/ || @subjects[i] =~ /(.*)\[(\d+)\/(\d+)\](.*)/ + for i in (0...@messageinfo.length) + print "group subjects: #{i} #{@messageinfo[i][:subject]}\n" if Debuglevel > 3 + if @messageinfo[i][:subject] =~ /(.*)\((\d+)\/(\d+)\)(.*)/ || @messageinfo[i][:subject] =~ /(.*)\[(\d+)\/(\d+)\](.*)/ j = "#{$1}#{$4} (#{$3})" number = $2 total = $3 else - j = @subjects[i] + j = @messageinfo[i][:subject] number = 1 total = 1 end if @groups.has_key?(j) and number.to_i != 0 - @groups[j]["messages"].push(@messids[i]) - @groups[j]["ids"].push(@ids[i].to_i) - @groups[j]["servers"].push(@servers[i]) - @groups[j]["subject"].push(@subjects[i]) + @groups[j]["messageinfo"].push(@messageinfo[i]) elsif number.to_i != 0 @groups[j] = {} @groups[j]["total"] = total - @groups[j]["messages"] = [ @messids[i] ] - @groups[j]["ids"] = [ @ids[i].to_i ] - @groups[j]["servers"] = [ @servers[i] ] - @groups[j]["subject"] = [ @subjects[i] ] + @groups[j]["messageinfo"] = [ (@messageinfo[i]) ] end end @grouped = true @@ -528,9 +531,9 @@ end def group_update_newsrc(subject) print "running group_update_newsrc\n"; - for i in (0...@groups[subject]["messages"].length) - if @connections[@groups[subject]["servers"][i]] - @connections[@groups[subject]["servers"][i]]["newsrc"].mark(@group, @groups[subject]["ids"][i]) + for i in (0...@groups[subject]["messageinfo"].length) + if @connections[@groups[subject]["messageinfo"][i][:server]] + @connections[@groups[subject]["messageinfo"][i][:server]]["newsrc"].mark(@group, @groups[subject]["messageinfo"][i][:id]) end end end @@ -548,8 +551,12 @@ def check_cache(cachedir) end def read_cache(cachedir) +p "reading cache" +p Time.now filename = "#{cachedir}/#{@group}.ripnewscache" excludes = {} + # id | messageid | subject + lineregexp = Regexp.new('^(\d+)\|(.*?)\|(.*)$') for server in @connections.keys excludes[server] = {} @connections[server]["skip_ids"].elements.collect!{|x| excludes[server][x]=true} @@ -558,11 +565,14 @@ def read_cache(cachedir) lines = file.readlines lines.collect{|line| # id | messageid | subject - if line =~ /^(\d+)\|(.*?)\|(.*)$/ + #if line =~ /^(\d+)\|(.*?)\|(.*)$/ + if lineregexp.match(line) != nil unless excludes.has_key?(server) and excludes[server].has_key?($1.to_i) or $1.to_i < @connections[server]["first"].to_i or $1.to_i > @connections[server]["last"].to_i - add($2, $1, server, $3) + if preselect($3) + add($2, $1, server, $3) + end @connections[server]["skip_ids"].insert($1.to_i) end end @@ -570,24 +580,28 @@ def read_cache(cachedir) file.close end end +p Time.now +memusage end def save_cache(cachedir, server) +p "writing cache" +p Time.now filename = "#{cachedir}/#{@group}.ripnewscache" if FileTest.directory?( cachedir ) - file = File.new( "#{filename}.#{server}.new", "w" ) or print "couldn't open cachefile for writing\n" + if ! File.copy("#{filename}.#{server}","#{filename}.#{server}.new") + puts "Couldn't renew cache" + end + file = File.new( "#{filename}.#{server}.new", "a+" ) or print "couldn't open cachefile for writing\n" print "Updating cache...\n" cache = [] - for i in (0...@subjects.length) - cache.push("#{@ids[i]}|#{@messids[i]}|#{@subjects[i]}\n") if @servers[i] == server + for i in (0...@messageinfo.length) + if @newids[server].has_key?(@messageinfo[i][:id]) + cache.push("#{@messageinfo[i][:id]}|#{@messageinfo[i][:messid]}|#{@messageinfo[i][:subject]}\n") if @messageinfo[i][:server] == server + end end cache.sort! file.print cache -# for i in (0...@subjects.length) -# if @servers[i] == server -# file.print "#{@ids[i]}|#{@messids[i]}|#{@subjects[i]}\n" -# end -# end file.close if ( File.move("#{filename}.#{server}.new", "#{filename}.#{server}") ) print "Cache updated for #{server}\n" @@ -595,6 +609,7 @@ def save_cache(cachedir, server) print "Couldn't update #{server} cache\n" end end +p Time.now end ############################################################### @@ -615,54 +630,49 @@ end ############################################################### def group_subject_sort(subj) + # XXX Waarom gebruik ik hier eigenlijk sort_arr ipv in place sorting? #print "Sorting articles\n" serverhash = {} for i in (0...@serverlist.length) serverhash[@serverlist[i]] = i end sort_arr = [] - for i in (0...@groups[subj]["subject"].length) - print "subj sort #{@groups[subj]["subject"][i]}\n" if Debuglevel > 2 - print "subj sort #{@groups[subj]["messages"][i]}\n" if Debuglevel > 2 - print "subj sort #{@groups[subj]["ids"][i]}\n" if Debuglevel > 2 - print "subj sort #{@groups[subj]["servers"][i]}\n" if Debuglevel > 2 - sort_arr.push( [ - @groups[subj]["subject"][i].dup, - @groups[subj]["messages"][i].dup, - #@groups[subj]["ids"][i].dup, # eng hoor, dit wijzigen - @groups[subj]["ids"][i], - @groups[subj]["servers"][i].dup - ] ) if serverhash[@groups[subj]["servers"][i]] != nil + for i in (0...@groups[subj]["messageinfo"].length) + print "subj sort #{@groups[subj]["messageinfo"][i][:subject]}\n" if Debuglevel > 2 + print "subj sort #{@groups[subj]["messageinfo"][i][:messid]}\n" if Debuglevel > 2 + print "subj sort #{@groups[subj]["messageinfo"][i][:id]}\n" if Debuglevel > 2 + print "subj sort #{@groups[subj]["messageinfo"][i][:server]}\n" if Debuglevel > 2 + sort_arr.push( + @groups[subj]["messageinfo"][i].dup + ) if serverhash[@groups[subj]["messageinfo"][i][:server]] != nil end sort_arr.sort!{|a,b| - r = ward_sort(a[0], b[0]) - if serverhash[a[3]] == nil or serverhash[b[3]] == nil - print "serverhash[a[3]]: #{serverhash[a[3]]}\n" - print "serverhash[b[3]]: #{serverhash[b[3]]}\n" - print "a[3]: #{a[3]}\n" - print "b[3]: #{a[3]}\n" + r = ward_sort(a[:subject], b[:subject]) + if serverhash[a[:server]] == nil or serverhash[b[:server]] == nil + print "serverhash[a[:server]]: #{serverhash[a[:server]]}\n" + print "serverhash[b[:server]]: #{serverhash[b[:server]]}\n" + print "a[:server]: #{a[:server]}\n" + print "b[:server]: #{a[:server]}\n" print "strange things going on here...\n" end if r == 0 - r = serverhash[a[3]] <=> serverhash[b[3]] + r = serverhash[a[:server]] <=> serverhash[b[:server]] end r } + + # XXX hmmmm lijkt er op dat @groups[x]["total"] hier wel gesloopt wordt... + # XXX maakt blijkbaar niet uit, maar is niet netjes + @groups[subj].clear sort_arr.collect{|i| if @groups[subj].has_key?("messages") - @groups[subj]["subject"].push(i[0]) - @groups[subj]["messages"].push(i[1]) - @groups[subj]["ids"].push(i[2]) - @groups[subj]["servers"].push(i[3]) + @groups[subj]["messageinfo"].push(i) else - @groups[subj]["subject"] = [i[0]] - @groups[subj]["messages"] = [i[1]] - @groups[subj]["ids"] = [i[2]] - @groups[subj]["servers"] = [i[3]] + @groups[subj]["messageinfo"] = [ i ] end - print "subject sort: #{i[0]}\n" if Debuglevel > 2 - print "server: #{i[3]}\n" if Debuglevel > 2 + print "subject sort: #{i[:subject]}\n" if Debuglevel > 2 + print "server: #{i[:server]}\n" if Debuglevel > 2 } #print "Done sorting\n" end @@ -719,10 +729,7 @@ end def quit # just testing if these should be reset... - @messids = [] - @ids = [] - @servers = [] - @subjects = [] + @messageinfo = [] for server in @connections.keys begin